diff --git a/.circleci/config.yml b/.circleci/config.yml index 7e497d755a14..19428b7bb9c2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,6 +31,14 @@ jobs: parallelism: 1 steps: - checkout + - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi + - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt' + - run: cat github.txt + - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt + - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi + - run: cat elapsed.txt + - run: if [ "$(cat elapsed.txt)" -lt "30" ]; then echo "PR is just opened, wait some actions from GitHub"; sleep 30; fi + - run: 'if grep -q "\"draft\": true," github.txt; then echo "draft mode, skip test!"; circleci-agent step halt; fi' - run: uv pip install -U -e . - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV" - run: mkdir -p test_preparation @@ -146,7 +154,7 @@ jobs: path: ~/transformers/installed.txt - run: python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) - run: ruff check examples tests src utils - - run: ruff format tests src utils --check + - run: ruff format examples tests src utils --check - run: python utils/custom_init_isort.py --check_only - run: python utils/sort_auto_mappings.py --check_only - run: python utils/check_doc_toc.py @@ -170,8 +178,7 @@ jobs: - store_artifacts: path: ~/transformers/installed.txt - run: python utils/check_copies.py - - run: python utils/check_modular_conversion.py --num_workers 4 - - run: python utils/check_table.py + - run: python utils/check_modular_conversion.py - run: python utils/check_dummies.py - run: python utils/check_repo.py - run: python utils/check_inits.py @@ -181,7 +188,6 @@ jobs: - run: make deps_table_check_updated - run: python utils/update_metadata.py --check-only - run: python utils/check_docstrings.py - - run: python utils/check_support_list.py workflows: version: 2 diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 71c75dac2ff0..2e87b4c2e1a8 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -28,13 +28,30 @@ "TRANSFORMERS_IS_CI": True, "PYTEST_TIMEOUT": 120, "RUN_PIPELINE_TESTS": False, - "RUN_PT_TF_CROSS_TESTS": False, - "RUN_PT_FLAX_CROSS_TESTS": False, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None} +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None} DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] +# Strings that commonly appear in the output of flaky tests when they fail. These are used with `pytest-rerunfailures` +# to rerun the tests that match these patterns. 
+FLAKY_TEST_FAILURE_PATTERNS = [ + "OSError", # Machine/connection transient error + "Timeout", # Machine/connection transient error + "ConnectionError", # Connection transient error + "FileNotFoundError", # Raised by `datasets` on Hub failures + "PIL.UnidentifiedImageError", # Raised by `PIL.Image.open` on connection issues + "HTTPError", # Also catches HfHubHTTPError + "AssertionError: Tensor-likes are not close!", # `torch.testing.assert_close`, we might have unlucky random values + # TODO: error downloading tokenizer's `merged.txt` from hub can cause all the exceptions below. Throw and handle + # them under a single message. + "TypeError: expected str, bytes or os.PathLike object, not NoneType", + "TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType", + "Converting from Tiktoken failed", + "KeyError: > ' steps = [ "checkout", @@ -152,9 +171,10 @@ def to_dict(self): "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" } }, + {"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}}, {"run": { "name": "Run tests", - "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} + "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"} }, {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}}, {"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}}, @@ -177,23 +197,6 @@ def job_name(self): # JOBS -torch_and_tf_job = CircleCIJob( - "torch_and_tf", - docker_image=[{"image":"huggingface/transformers-torch-tf-light"}], - additional_env={"RUN_PT_TF_CROSS_TESTS": True}, - marker="is_pt_tf_cross_test", - pytest_options={"rA": None, "durations": 0}, -) - - -torch_and_flax_job = CircleCIJob( - "torch_and_flax", - additional_env={"RUN_PT_FLAX_CROSS_TESTS": True}, - docker_image=[{"image":"huggingface/transformers-torch-jax-light"}], - marker="is_pt_flax_cross_test", - pytest_options={"rA": None, "durations": 0}, -) - torch_job = CircleCIJob( "torch", docker_image=[{"image": "huggingface/transformers-torch-light"}], @@ -204,6 +207,9 @@ def job_name(self): generate_job = CircleCIJob( "generate", docker_image=[{"image": "huggingface/transformers-torch-light"}], + # networkx==3.3 (after #36957) cause some issues + # TODO: remove this once it works directly + install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"], marker="generate", parallelism=6, ) @@ -267,6 +273,7 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . 
&& uv pip install -r examples/pytorch/_tests_requirements.txt"], + pytest_num_workers=4, ) @@ -274,6 +281,7 @@ def job_name(self): "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, docker_image=[{"image":"huggingface/transformers-examples-tf"}], + pytest_num_workers=2, ) @@ -324,6 +332,9 @@ def job_name(self): non_model_job = CircleCIJob( "non_model", docker_image=[{"image": "huggingface/transformers-torch-light"}], + # networkx==3.3 (after #36957) cause some issues + # TODO: remove this once it works directly + install_steps=["uv venv && uv pip install . && uv pip install networkx==3.2.1"], marker="not generate", parallelism=6, ) @@ -353,9 +364,9 @@ def job_name(self): pytest_num_workers=1, ) -REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip -EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job] -PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job] +REGULAR_TESTS = [torch_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip +EXAMPLES_TESTS = [examples_torch_job] +PIPELINE_TESTS = [pipelines_torch_job] REPO_UTIL_TESTS = [repo_utils_job] DOC_TESTS = [doc_test_job] ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 9b2c00bac50d..6d5eca4f1ec8 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -38,21 +38,21 @@ body: - text models: @ArthurZucker - vision models: @amyeroberts, @qubvel - - speech models: @ylacombe, @eustlb + - speech models: @eustlb - graph models: @clefourrier Library: - - flax: @sanchit-gandhi + - flax: @gante and @Rocketknight1 - generate: @zucchini-nlp (visual-language models) or @gante (all others) - pipelines: @Rocketknight1 - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker and @itazap - - trainer: @muellerzr @SunMarc + - trainer: @zach-huggingface @SunMarc Integrations: - - deepspeed: HF Trainer/Accelerate: @muellerzr + - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber @@ -72,7 +72,7 @@ body: Maintained examples (not research project or legacy): - - Flax: @sanchit-gandhi + - Flax: @Rocketknight1 - PyTorch: See Models above and tag the person corresponding to the modality of the example. 
- TensorFlow: @Rocketknight1 diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ee7a7eaae113..439ab02ebc92 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -41,22 +41,22 @@ Models: - text models: @ArthurZucker - vision models: @amyeroberts, @qubvel -- speech models: @ylacombe, @eustlb +- speech models: @eustlb - graph models: @clefourrier Library: -- flax: @sanchit-gandhi +- flax: @gante and @Rocketknight1 - generate: @zucchini-nlp (visual-language models) or @gante (all others) - pipelines: @Rocketknight1 - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker -- trainer: @muellerzr and @SunMarc +- trainer: @zach-huggingface and @SunMarc - chat templates: @Rocketknight1 Integrations: -- deepspeed: HF Trainer/Accelerate: @muellerzr +- deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber @@ -72,7 +72,7 @@ HF projects: Maintained examples (not research project or legacy): -- Flax: @sanchit-gandhi +- Flax: @Rocketknight1 - PyTorch: See Models above and tag the person corresponding to the modality of the example. - TensorFlow: @Rocketknight1 diff --git a/.github/scripts/assign_reviewers.py b/.github/scripts/assign_reviewers.py new file mode 100644 index 000000000000..548ea3cb49b0 --- /dev/null +++ b/.github/scripts/assign_reviewers.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import github +import json +from github import Github +import re +from collections import Counter +from pathlib import Path + +def pattern_to_regex(pattern): + if pattern.startswith("/"): + start_anchor = True + pattern = re.escape(pattern[1:]) + else: + start_anchor = False + pattern = re.escape(pattern) + # Replace `*` with "any number of non-slash characters" + pattern = pattern.replace(r"\*", "[^/]*") + if start_anchor: + pattern = r"^\/?" + pattern # Allow an optional leading slash after the start of the string + return pattern + +def get_file_owners(file_path, codeowners_lines): + # Process lines in reverse (last matching pattern takes precedence) + for line in reversed(codeowners_lines): + # Skip comments and empty lines, strip inline comments + line = line.split('#')[0].strip() + if not line: + continue + + # Split into pattern and owners + parts = line.split() + pattern = parts[0] + # Can be empty, e.g. for dummy files with explicitly no owner! + owners = [owner.removeprefix("@") for owner in parts[1:]] + + # Check if file matches pattern + file_regex = pattern_to_regex(pattern) + if re.search(file_regex, file_path) is not None: + return owners # Remember, can still be empty! 
+ return [] # Should never happen, but just in case + +def main(): + script_dir = Path(__file__).parent.absolute() + with open(script_dir / "codeowners_for_review_action") as f: + codeowners_lines = f.readlines() + + g = Github(os.environ['GITHUB_TOKEN']) + repo = g.get_repo("huggingface/transformers") + with open(os.environ['GITHUB_EVENT_PATH']) as f: + event = json.load(f) + + # The PR number is available in the event payload + pr_number = event['pull_request']['number'] + pr = repo.get_pull(pr_number) + pr_author = pr.user.login + + existing_reviews = list(pr.get_reviews()) + if existing_reviews: + print(f"Already has reviews: {[r.user.login for r in existing_reviews]}") + return + + users_requested, teams_requested = pr.get_review_requests() + users_requested = list(users_requested) + if users_requested: + print(f"Reviewers already requested: {users_requested}") + return + + locs_per_owner = Counter() + for file in pr.get_files(): + owners = get_file_owners(file.filename, codeowners_lines) + for owner in owners: + locs_per_owner[owner] += file.changes + + # Assign the top 2 based on locs changed as reviewers, but skip the owner if present + locs_per_owner.pop(pr_author, None) + top_owners = locs_per_owner.most_common(2) + print("Top owners", top_owners) + top_owners = [owner[0] for owner in top_owners] + try: + pr.create_review_request(top_owners) + except github.GithubException as e: + print(f"Failed to request review for {top_owners}: {e}") + + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/codeowners_for_review_action b/.github/scripts/codeowners_for_review_action new file mode 100644 index 000000000000..7325b0f570cc --- /dev/null +++ b/.github/scripts/codeowners_for_review_action @@ -0,0 +1,370 @@ +# Top-level rules are matched only if nothing else matches +* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, he will do the dispatch +*.md @stevhliu +*tokenization* @ArthurZucker +docs/ @stevhliu +/benchmark/ @McPatate +/docker/ @ydshieh @ArthurZucker + +# More high-level globs catch cases when specific rules later don't apply +/src/transformers/models/*/processing* @molbap @yonigozlan @qubvel +/src/transformers/models/*/image_processing* @qubvel +/src/transformers/models/*/image_processing_*_fast* @yonigozlan + +# Owners of subsections of the library +/src/transformers/generation/ @gante +/src/transformers/pipeline/ @Rocketknight1 @yonigozlan +/src/transformers/integrations/ @SunMarc @MekkCyber @zach-huggingface +/src/transformers/quantizers/ @SunMarc @MekkCyber +tests/ @ydshieh +tests/generation/ @gante + +/src/transformers/models/auto/ @ArthurZucker +/src/transformers/utils/ @ArthurZucker @Rocketknight1 +/src/transformers/loss/ @ArthurZucker +/src/transformers/onnx/ @michaelbenayoun + +# Specific files come after the sections/globs, so they take priority +/.circleci/config.yml @ArthurZucker @ydshieh +/utils/tests_fetcher.py @ydshieh +trainer.py @zach-huggingface @SunMarc +trainer_utils.py @zach-huggingface @SunMarc +/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker + +# Owners of individual models are specific / high priority, and so they come last +# mod* captures modeling and modular files + +# Text models +/src/transformers/models/albert/mod*_albert* @ArthurZucker +/src/transformers/models/bamba/mod*_bamba* @ArthurZucker +/src/transformers/models/bart/mod*_bart* @ArthurZucker +/src/transformers/models/barthez/mod*_barthez* @ArthurZucker +/src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker 
+/src/transformers/models/bert/mod*_bert* @ArthurZucker +/src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker +/src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker +/src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker +/src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker +/src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker +/src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker +/src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker +/src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker +/src/transformers/models/bloom/mod*_bloom* @ArthurZucker +/src/transformers/models/bort/mod*_bort* @ArthurZucker +/src/transformers/models/byt5/mod*_byt5* @ArthurZucker +/src/transformers/models/camembert/mod*_camembert* @ArthurZucker +/src/transformers/models/canine/mod*_canine* @ArthurZucker +/src/transformers/models/codegen/mod*_codegen* @ArthurZucker +/src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker +/src/transformers/models/cohere/mod*_cohere* @ArthurZucker +/src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker +/src/transformers/models/convbert/mod*_convbert* @ArthurZucker +/src/transformers/models/cpm/mod*_cpm* @ArthurZucker +/src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker +/src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker +/src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker +/src/transformers/models/deberta/mod*_deberta* @ArthurZucker +/src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker +/src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker +/src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker +/src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker +/src/transformers/models/dpr/mod*_dpr* @ArthurZucker +/src/transformers/models/electra/mod*_electra* @ArthurZucker +/src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker +/src/transformers/models/ernie/mod*_ernie* @ArthurZucker +/src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker +/src/transformers/models/esm/mod*_esm* @ArthurZucker +/src/transformers/models/falcon/mod*_falcon* @ArthurZucker +/src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker +/src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker +/src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker +/src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker +/src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker +/src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker +/src/transformers/models/fnet/mod*_fnet* @ArthurZucker +/src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker +/src/transformers/models/funnel/mod*_funnel* @ArthurZucker +/src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker +/src/transformers/models/gemma/mod*_gemma* @ArthurZucker +/src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker +/src/transformers/models/glm/mod*_glm* @ArthurZucker +/src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker +/src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker +/src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker +/src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker +/src/transformers/models/gptj/mod*_gptj* @ArthurZucker +/src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker +/src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker +/src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* 
@ArthurZucker +/src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker +/src/transformers/models/granite/mod*_granite* @ArthurZucker +/src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker +/src/transformers/models/herbert/mod*_herbert* @ArthurZucker +/src/transformers/models/ibert/mod*_ibert* @ArthurZucker +/src/transformers/models/jamba/mod*_jamba* @ArthurZucker +/src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker +/src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker +/src/transformers/models/led/mod*_led* @ArthurZucker +/src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez +/src/transformers/models/longformer/mod*_longformer* @ArthurZucker +/src/transformers/models/longt5/mod*_longt5* @ArthurZucker +/src/transformers/models/luke/mod*_luke* @ArthurZucker +/src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker +/src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker +/src/transformers/models/mamba/mod*_mamba* @ArthurZucker +/src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker +/src/transformers/models/marian/mod*_marian* @ArthurZucker +/src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker +/src/transformers/models/mbart/mod*_mbart* @ArthurZucker +/src/transformers/models/mega/mod*_mega* @ArthurZucker +/src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker +/src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker +/src/transformers/models/mistral/mod*_mistral* @ArthurZucker +/src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker +/src/transformers/models/mluke/mod*_mluke* @ArthurZucker +/src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker +/src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker +/src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker +/src/transformers/models/mpt/mod*_mpt* @ArthurZucker +/src/transformers/models/mra/mod*_mra* @ArthurZucker +/src/transformers/models/mt5/mod*_mt5* @ArthurZucker +/src/transformers/models/mvp/mod*_mvp* @ArthurZucker +/src/transformers/models/myt5/mod*_myt5* @ArthurZucker +/src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker +/src/transformers/models/nezha/mod*_nezha* @ArthurZucker +/src/transformers/models/nllb/mod*_nllb* @ArthurZucker +/src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker +/src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker +/src/transformers/models/olmo/mod*_olmo* @ArthurZucker +/src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker +/src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker +/src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker +/src/transformers/models/opt/mod*_opt* @ArthurZucker +/src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker +/src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker +/src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker +/src/transformers/models/phi/mod*_phi* @ArthurZucker +/src/transformers/models/phi3/mod*_phi3* @ArthurZucker +/src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker +/src/transformers/models/phobert/mod*_phobert* @ArthurZucker +/src/transformers/models/plbart/mod*_plbart* @ArthurZucker +/src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker +/src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker +/src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker +/src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker +/src/transformers/models/rag/mod*_rag* @ArthurZucker +/src/transformers/models/realm/mod*_realm* 
@ArthurZucker +/src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker +/src/transformers/models/reformer/mod*_reformer* @ArthurZucker +/src/transformers/models/rembert/mod*_rembert* @ArthurZucker +/src/transformers/models/retribert/mod*_retribert* @ArthurZucker +/src/transformers/models/roberta/mod*_roberta* @ArthurZucker +/src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker +/src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker +/src/transformers/models/roformer/mod*_roformer* @ArthurZucker +/src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker +/src/transformers/models/splinter/mod*_splinter* @ArthurZucker +/src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker +/src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker +/src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker +/src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker +/src/transformers/models/t5/mod*_t5* @ArthurZucker +/src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker +/src/transformers/models/tapex/mod*_tapex* @ArthurZucker +/src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker +/src/transformers/models/ul2/mod*_ul2* @ArthurZucker +/src/transformers/models/umt5/mod*_umt5* @ArthurZucker +/src/transformers/models/xmod/mod*_xmod* @ArthurZucker +/src/transformers/models/xglm/mod*_xglm* @ArthurZucker +/src/transformers/models/xlm/mod*_xlm* @ArthurZucker +/src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker +/src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker +/src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker +/src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker +/src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker +/src/transformers/models/yoso/mod*_yoso* @ArthurZucker +/src/transformers/models/zamba/mod*_zamba* @ArthurZucker + +# Vision models +/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel +/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel +/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel +/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel +/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel +/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel +/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel +/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel +/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel +/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel +/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel +/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel +/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel +/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel +/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel +/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel +/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel +/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel +/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel +/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel +/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel +/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel 
+/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel +/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel +/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel +/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel +/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel +/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel +/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel +/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel +/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel +/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel +/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel +/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel +/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel +/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel +/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel +/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel +/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel +/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel +/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel +/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel +/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel +/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel +/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel +/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel +/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel +/src/transformers/models/van/mod*_van* @amyeroberts @qubvel +/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel +/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel +/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel +/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel +/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel +/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel +/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel +/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel +/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel + +# Audio models +/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb +/src/transformers/models/bark/mod*_bark* @eustlb +/src/transformers/models/clap/mod*_clap* @eustlb +/src/transformers/models/dac/mod*_dac* @eustlb +/src/transformers/models/encodec/mod*_encodec* @eustlb +/src/transformers/models/hubert/mod*_hubert* @eustlb +/src/transformers/models/mctct/mod*_mctct* @eustlb +/src/transformers/models/mimi/mod*_mimi* @eustlb +/src/transformers/models/mms/mod*_mms* @eustlb +/src/transformers/models/moshi/mod*_moshi* @eustlb +/src/transformers/models/musicgen/mod*_musicgen* @eustlb +/src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb +/src/transformers/models/pop2piano/mod*_pop2piano* @eustlb +/src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb +/src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb +/src/transformers/models/sew/mod*_sew* 
@eustlb +/src/transformers/models/sew_d/mod*_sew_d* @eustlb +/src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb +/src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb +/src/transformers/models/speecht5/mod*_speecht5* @eustlb +/src/transformers/models/unispeech/mod*_unispeech* @eustlb +/src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb +/src/transformers/models/univnet/mod*_univnet* @eustlb +/src/transformers/models/vits/mod*_vits* @eustlb +/src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb +/src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb +/src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb +/src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb +/src/transformers/models/wavlm/mod*_wavlm* @eustlb +/src/transformers/models/whisper/mod*_whisper* @eustlb +/src/transformers/models/xls_r/mod*_xls_r* @eustlb +/src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb + +# Video models +/src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1 +/src/transformers/models/videomae/mod*_videomae* @Rocketknight1 +/src/transformers/models/vivit/mod*_vivit* @Rocketknight1 + +# Multimodal models +/src/transformers/models/align/mod*_align* @zucchini-nlp +/src/transformers/models/altclip/mod*_altclip* @zucchini-nlp +/src/transformers/models/aria/mod*_aria* @zucchini-nlp +/src/transformers/models/blip/mod*_blip* @zucchini-nlp +/src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp +/src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp +/src/transformers/models/bros/mod*_bros* @zucchini-nlp +/src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp +/src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp +/src/transformers/models/clip/mod*_clip* @zucchini-nlp +/src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp +/src/transformers/models/clvp/mod*_clvp* @zucchini-nlp +/src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan +/src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp +/src/transformers/models/deplot/mod*_deplot* @zucchini-nlp +/src/transformers/models/donut/mod*_donut* @zucchini-nlp +/src/transformers/models/flava/mod*_flava* @zucchini-nlp +/src/transformers/models/git/mod*_git* @zucchini-nlp +/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel +/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp +/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp +/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp +/src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp +/src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp +/src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp +/src/transformers/models/kosmos_2/mod*_kosmos_2* @zucchini-nlp +/src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge +/src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge +/src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge +/src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge +/src/transformers/models/lilt/mod*_lilt* @zucchini-nlp +/src/transformers/models/llava/mod*_llava* @zucchini-nlp @arthurzucker +/src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp +/src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp +/src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp +/src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp 
+/src/transformers/models/matcha/mod*_matcha* @zucchini-nlp +/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp +/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp +/src/transformers/models/nougat/mod*_nougat* @NielsRogge +/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan +/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp +/src/transformers/models/owlvit/mod*_owlvit* @qubvel +/src/transformers/models/owlv2/mod*_owlv2* @qubvel +/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap +/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp +/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp +/src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker +/src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker +/src/transformers/models/siglip/mod*_siglip* @zucchini-nlp +/src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp +/src/transformers/models/tapas/mod*_tapas* @NielsRogge +/src/transformers/models/trocr/mod*_trocr* @zucchini-nlp +/src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp +/src/transformers/models/tvp/mod*_tvp* @zucchini-nlp +/src/transformers/models/udop/mod*_udop* @zucchini-nlp +/src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp +/src/transformers/models/vilt/mod*_vilt* @zucchini-nlp +/src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp +/src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1 +/src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1 +/src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp +/src/transformers/models/xclip/mod*_xclip* @zucchini-nlp + +# Reinforcement learning models +/src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1 +/src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1 + +# Time series models +/src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1 +/src/transformers/models/informer/mod*_informer* @Rocketknight1 +/src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1 +/src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1 +/src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1 + +# Graph models +/src/transformers/models/graphormer/mod*_graphormer* @clefourrier + +# Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI +utils/dummy* \ No newline at end of file diff --git a/.github/workflows/assign-reviewers.yml b/.github/workflows/assign-reviewers.yml new file mode 100644 index 000000000000..46bcb52a169f --- /dev/null +++ b/.github/workflows/assign-reviewers.yml @@ -0,0 +1,26 @@ +name: Assign PR Reviewers +on: + pull_request_target: + branches: + - main + types: [ready_for_review] + +jobs: + assign_reviewers: + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install PyGithub + - name: Run assignment script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python 
.github/scripts/assign_reviewers.py \ No newline at end of file diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index bb5281778bf2..6b5555097c09 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -64,7 +64,7 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) - python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" + python3 benchmark/benchmarks_entrypoint.py "$BRANCH_NAME" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} # Enable this to see debug logs @@ -73,3 +73,4 @@ jobs: PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} PGUSER: transformers_benchmarks PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml index 9d947684ee86..5606668531da 100644 --- a/.github/workflows/build-ci-docker-images.yml +++ b/.github/workflows/build-ci-docker-images.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: - file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch", "examples-tf"] + file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "jax-light", "examples-torch", "examples-tf"] continue-on-error: true steps: @@ -34,11 +34,11 @@ jobs: name: Set tag run: | if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then - echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV" + echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV" echo "setting it to DEV!" else echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV" - + fi - name: Set up Docker Buildx diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index f698f860b2f9..cbf7caa84e87 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -15,4 +15,3 @@ jobs: pr_number: ${{ github.event.number }} package: transformers languages: ar de en es fr hi it ko pt tr zh ja te - custom_container: huggingface/transformers-doc-builder diff --git a/.github/workflows/change_pr_to_draft.yml b/.github/workflows/change_pr_to_draft.yml new file mode 100644 index 000000000000..c8132d2f49ea --- /dev/null +++ b/.github/workflows/change_pr_to_draft.yml @@ -0,0 +1,25 @@ +name: Change PR to draft + +on: + pull_request_target: + types: [opened, reopened] + +jobs: + convert_pr_to_draft: + runs-on: ubuntu-22.04 + name: Convert PR to draft + permissions: + pull-requests: write + contents: write + if: github.event.pull_request.draft == false + steps: + - name: Convert PR to draft + shell: bash + env: + PR_NUMBER: ${{ github.event.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + run: | + echo $PR_NUMBER + gh pr ready $PR_NUMBER --repo $REPO --undo + gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers and trigger CI." 
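As a side note, a minimal sketch (PR number and repo are placeholders) of how an author can take a PR back out of draft from the command line once it is ready; this is the same `gh pr ready` command the workflow above runs with `--undo`, just without that flag:

```bash
# Hypothetical example: mark PR 12345 as ready for review, which un-pauses CI
# and lets the reviewer-assignment workflow fire on the ready_for_review event.
gh pr ready 12345 --repo huggingface/transformers
```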
diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml index f3ea8646900a..5963523fd76c 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_model_tests.yml @@ -22,7 +22,6 @@ env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 001e2c531d9b..0997a1112ad7 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -30,7 +30,6 @@ env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 jobs: diff --git a/.github/workflows/model_jobs_amd.yml b/.github/workflows/model_jobs_amd.yml index a7e6c7b1ccd5..c90181ec6f1b 100644 --- a/.github/workflows/model_jobs_amd.yml +++ b/.github/workflows/model_jobs_amd.yml @@ -30,7 +30,6 @@ env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 jobs: diff --git a/.github/workflows/new_model_pr_merged_notification.yml b/.github/workflows/new_model_pr_merged_notification.yml new file mode 100644 index 000000000000..6282528c0b74 --- /dev/null +++ b/.github/workflows/new_model_pr_merged_notification.yml @@ -0,0 +1,68 @@ +# Used to notify core maintainers about new model PR being merged +name: New model PR merged notification + +on: + push: + branches: + - main + paths: + - 'src/transformers/models/*/modeling_*' + +jobs: + notify_new_model: + name: Notify new model + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Check new model + shell: bash + run: | + python -m pip install gitpython + python -c 'from utils.pr_slow_ci_models import get_new_model; new_model = get_new_model(diff_with_last_commit=True); print(new_model)' | tee output.txt + echo "NEW_MODEL=$(tail -n 1 output.txt)" >> $GITHUB_ENV + echo "COMMIT_SHA=$(git log -1 --format=%H)" >> $GITHUB_ENV + + - name: print commit sha + if: ${{ env.NEW_MODEL != ''}} + shell: bash + run: | + echo "$COMMIT_SHA" + + - name: print new model + if: ${{ env.NEW_MODEL != ''}} + shell: bash + run: | + echo "$NEW_MODEL" + + - name: Notify + if: ${{ env.NEW_MODEL != ''}} + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. 
+ # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: transformers-new-model-notification + # For posting a rich message using Block Kit + payload: | + { + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "New model!", + "emoji": true + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": " GH_ArthurZucker, GH_lysandrejik, GH_ydshieh" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 7294777655e1..099ded8018e9 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -7,14 +7,13 @@ on: env: OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA" HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true jobs: get_modified_models: @@ -25,13 +24,13 @@ jobs: steps: - name: Check out code uses: actions/checkout@v4 - + - name: Get changed files id: changed-files - uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42 + uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c with: files: src/transformers/models/** - + - name: Run step if only the files listed above change if: steps.changed-files.outputs.any_changed == 'true' id: set-matrix @@ -60,41 +59,41 @@ jobs: if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }} strategy: fail-fast: false - matrix: + matrix: model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }} steps: - name: Check out code uses: actions/checkout@v4 - + - name: Install locally transformers & other libs run: | apt install sudo sudo -H pip install --upgrade pip - sudo -H pip uninstall -y transformers - sudo -H pip install -U -e ".[testing]" + sudo -H pip uninstall -y transformers + sudo -H pip install -U -e ".[testing]" MAX_JOBS=4 pip install flash-attn --no-build-isolation pip install bitsandbytes - + - name: NVIDIA-SMI run: | nvidia-smi - + - name: Show installed libraries and their versions run: pip freeze - + - name: Run FA2 tests id: run_fa2_tests run: pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_* - + - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests" if: ${{ always() }} uses: actions/upload-artifact@v4 with: name: ${{ matrix.model-name }}_fa2_tests path: /transformers/reports/${{ matrix.model-name }}_fa2_tests - + - name: Post to Slack if: always() uses: huggingface/hf-workflows/.github/actions/post-slack@main @@ -103,13 +102,13 @@ jobs: title: 🤗 Results of the 
FA2 tests - ${{ matrix.model-name }} status: ${{ steps.run_fa2_tests.conclusion}} slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }} - + - name: Run integration tests id: run_integration_tests if: always() run: pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_* - + - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}" if: ${{ always() }} uses: actions/upload-artifact@v4 @@ -119,7 +118,7 @@ jobs: - name: Post to Slack if: always() - uses: huggingface/hf-workflows/.github/actions/post-slack@main + uses: huggingface/hf-workflows/.github/actions/post-slack@main with: slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} title: 🤗 Results of the Integration tests - ${{ matrix.model-name }} diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml index 7adad7551641..8defab44b2b0 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -22,7 +22,6 @@ env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 jobs: @@ -30,7 +29,7 @@ jobs: runs-on: ubuntu-22.04 name: Get PR number # For security: only allow team members to run - if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} outputs: PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} steps: diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 6931c2f3eadc..621061988949 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -14,7 +14,6 @@ env: MKL_NUM_THREADS: 8 PYTEST_TIMEOUT: 60 TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} jobs: diff --git a/.github/workflows/self-push-caller.yml b/.github/workflows/self-push-caller.yml index 59adde4c54e0..56299f30e517 100644 --- a/.github/workflows/self-push-caller.yml +++ b/.github/workflows/self-push-caller.yml @@ -25,7 +25,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v41 + uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c - name: Was setup changed id: was_changed @@ -51,4 +51,4 @@ jobs: needs: build-docker-containers steps: - name: Trigger push CI via workflow_run - run: echo "Trigger push CI via workflow_run" \ No newline at end of file + run: echo "Trigger push CI via workflow_run" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 940495c28753..3b3be41e3e9b 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -24,7 +24,6 @@ env: MKL_NUM_THREADS: 8 PYTEST_TIMEOUT: 60 
TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 jobs: @@ -293,7 +292,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV - + - name: Update clone using environment variables working-directory: /transformers run: | @@ -406,7 +405,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV - + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -516,7 +515,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV - + - name: Update clone using environment variables working-directory: /workspace/transformers run: | @@ -648,6 +647,6 @@ jobs: # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | pip install huggingface_hub - pip install slack_sdk + pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index a33b6e579c0e..4c6284a78cda 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -15,7 +15,7 @@ jobs: uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main with: job: run_models_gpu - slack_report_channel: "#transformers-ci-daily-amd" + slack_report_channel: "#amd-hf-ci" runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 @@ -26,7 +26,7 @@ jobs: uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main with: job: run_pipelines_torch_gpu - slack_report_channel: "#transformers-ci-daily-amd" + slack_report_channel: "#amd-hf-ci" runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 @@ -37,7 +37,7 @@ jobs: uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main with: job: run_examples_gpu - slack_report_channel: "#transformers-ci-daily-amd" + slack_report_channel: "#amd-hf-ci" runner: mi250 docker: huggingface/transformers-pytorch-amd-gpu ci_event: Scheduled CI (AMD) - mi250 @@ -48,7 +48,7 @@ jobs: uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main with: job: run_torch_cuda_extensions_gpu - slack_report_channel: "#transformers-ci-daily-amd" + slack_report_channel: "#amd-hf-ci" runner: mi250 docker: huggingface/transformers-pytorch-deepspeed-amd-gpu ci_event: Scheduled CI (AMD) - mi250 diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 48731f1c2ed3..78971820d146 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -40,7 +40,6 @@ env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 CUDA_VISIBLE_DEVICES: 0,1 NUM_SLICES: 2 @@ -571,4 +570,4 @@ jobs: with: docker: ${{ inputs.docker }} start_sha: ${{ github.sha }} - secrets: inherit \ No newline at end of file + secrets: inherit diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index 02b022698b0c..e648883f191e 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -5,7 +5,7 @@ on: inputs: runner_type: description: 'Type of runner to test (a10 or t4)' - required: true + required: true docker_image: description: 
'Name of the Docker image' required: true @@ -15,15 +15,14 @@ on: env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true CUDA_VISIBLE_DEVICES: 0,1 - RUN_PT_TF_CROSS_TESTS: 1 jobs: get_runner: @@ -78,7 +77,7 @@ jobs: - name: Show installed libraries and their versions working-directory: /transformers run: pip freeze - + - name: NVIDIA-SMI run: | nvidia-smi diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml index 90cd73077ac0..d55b6e336c09 100644 --- a/.github/workflows/update_metdata.yml +++ b/.github/workflows/update_metdata.yml @@ -19,7 +19,7 @@ jobs: - name: Setup environment run: | pip install --upgrade pip - pip install datasets pandas==2.0.3 + pip install datasets pandas pip install .[torch,tf,flax] - name: Update metadata diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9eeea9971540..c4804c69fb03 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -221,10 +221,10 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide. If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check - make sure you install the documentation builder: + make sure you install the [documentation builder](https://github.com/huggingface/doc-builder). ```bash - pip install ".[docs]" + pip install hf-doc-builder ``` Run the following command from the root of the repository: @@ -343,8 +343,6 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t Like the slow tests, there are other environment variables available which are not enabled by default during testing: - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers. -- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration. -- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration. More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py). diff --git a/ISSUES.md b/ISSUES.md index a5969a3027f8..3b4e587a6d12 100644 --- a/ISSUES.md +++ b/ISSUES.md @@ -263,9 +263,9 @@ You are not required to read the following guidelines before opening an issue. H But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying it. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like: ``` - > How big is your gpu cluster? + > How big is your GPU cluster? - Our cluster is made of 256 gpus. 
+ Our cluster is made of 256 GPUs. ``` If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment. diff --git a/Makefile b/Makefile index 710c555b74f6..21152e985082 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,6 @@ autogenerate_code: deps_table_update repo-consistency: python utils/check_copies.py python utils/check_modular_conversion.py - python utils/check_table.py python utils/check_dummies.py python utils/check_repo.py python utils/check_inits.py @@ -46,7 +45,6 @@ repo-consistency: python utils/check_doctest_list.py python utils/update_metadata.py --check-only python utils/check_docstrings.py - python utils/check_support_list.py # this target runs checks on all files @@ -82,7 +80,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_modular_conversion.py --fix_and_overwrite - python utils/check_table.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite python utils/check_doctest_list.py --fix_and_overwrite python utils/check_docstrings.py --fix_and_overwrite diff --git a/README.md b/README.md index 6bbcdbc82f8b..e51a2c51bb55 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ limitations under the License.

+ Checkpoints on Hub Build GitHub Documentation @@ -54,275 +55,254 @@ limitations under the License.

-State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
+State-of-the-art pretrained models for inference and training

-🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio. +Transformers is a library of pretrained text, computer vision, audio, video, and multimodal models for inference and training. Use Transformers to fine-tune models on your data, build inference applications, and for generative AI use cases across multiple modalities. -These models can be applied on: +There are over 500K+ Transformers [model checkpoints](https://huggingface.co/models?library=transformers&sort=trending) on the [Hugging Face Hub](https://huggingface.com/models) you can use. -* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages. -* 🖼️ Images, for tasks like image classification, object detection, and segmentation. -* 🗣️ Audio, for tasks like speech recognition and audio classification. +Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away. -Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. +## Installation -🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. +Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.0+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+. -🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other. +Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager. -## Online demos +```py +# venv +python -m venv .my-env +source .my-env/bin/activate -You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models. +# uv +uv venv .my-env +source .my-env/bin/activate +``` -Here are a few examples: +Install Transformers in your virtual environment. 
-In Natural Language Processing: -- [Masked word completion with BERT](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) -- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) -- [Text generation with Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) -- [Natural Language Inference with RoBERTa](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal) -- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) -- [Question answering with DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) -- [Translation with T5](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +```py +# pip +pip install transformers -In Computer Vision: -- [Image classification with ViT](https://huggingface.co/google/vit-base-patch16-224) -- [Object Detection with DETR](https://huggingface.co/facebook/detr-resnet-50) -- [Semantic Segmentation with SegFormer](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) -- [Panoptic Segmentation with Mask2Former](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic) -- [Depth Estimation with Depth Anything](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) -- [Video Classification with 
VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae) -- [Universal Segmentation with OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) +# uv +uv pip install transformers +``` -In Audio: -- [Automatic Speech Recognition with Whisper](https://huggingface.co/openai/whisper-large-v3) -- [Keyword Spotting with Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) -- [Audio Classification with Audio Spectrogram Transformer](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) +Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error. -In Multimodal tasks: -- [Table Question Answering with TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq) -- [Visual Question Answering with ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) -- [Image captioning with LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf) -- [Zero-shot Image Classification with SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384) -- [Document Question Answering with LayoutLM](https://huggingface.co/impira/layoutlm-document-qa) -- [Zero-shot Video Classification with X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip) -- [Zero-shot Object Detection with OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2) -- [Zero-shot Image Segmentation with CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg) -- [Automatic Mask Generation with SAM](https://huggingface.co/docs/transformers/model_doc/sam) +```shell +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` +## Quickstart -## 100 projects using Transformers +Get started with Transformers right away with the [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API. The `Pipeline` is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output. -Transformers is more than a toolkit to use pretrained models: it's a community of projects built around it and the -Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone -else to build their dream projects. +Instantiate a pipeline and specify model to use for text generation. The model is downloaded and cached so you can easily reuse it again. Finally, pass some text to prompt the model. -In order to celebrate the 100,000 stars of transformers, we have decided to put the spotlight on the -community, and we have created the [awesome-transformers](./awesome-transformers.md) page which lists 100 -incredible projects built in the vicinity of transformers. +```py +from transformers import pipeline -If you own or use a project that you believe should be part of the list, please open a PR to add it! +pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B") +pipeline("the secret to baking a really good cake is ") +[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? 
To make 2 cakes, you will need 2 cups of sugar.'}] +``` -## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub. +To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system. - - Hugging Face Enterprise Hub -
- -## Quick tour +> [!TIP] +> You can also chat with a model directly from the command line. +> ```shell +> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct +> ``` -To immediately use a model on a given input (text, image, audio, ...), we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts: +```py +import torch +from transformers import pipeline -```python ->>> from transformers import pipeline +chat = [ + {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, + {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} +] -# Allocate a pipeline for sentiment-analysis ->>> classifier = pipeline('sentiment-analysis') ->>> classifier('We are very happy to introduce pipeline to the transformers repository.') -[{'label': 'POSITIVE', 'score': 0.9996980428695679}] +pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") +response = pipeline(chat, max_new_tokens=512) +print(response[0]["generated_text"][-1]["content"]) ``` -The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%. - -Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image: - -``` python ->>> import requests ->>> from PIL import Image ->>> from transformers import pipeline - -# Download an image with cute cats ->>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" ->>> image_data = requests.get(url, stream=True).raw ->>> image = Image.open(image_data) - -# Allocate a pipeline for object detection ->>> object_detector = pipeline('object-detection') ->>> object_detector(image) -[{'score': 0.9982201457023621, - 'label': 'remote', - 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}}, - {'score': 0.9960021376609802, - 'label': 'remote', - 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}}, - {'score': 0.9954745173454285, - 'label': 'couch', - 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}}, - {'score': 0.9988006353378296, - 'label': 'cat', - 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}}, - {'score': 0.9986783862113953, - 'label': 'cat', - 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}] +Expand the examples below to see how `Pipeline` works for different modalities and tasks. + +
+Automatic speech recognition + +```py +from transformers import pipeline + +pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") +pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} ``` -Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right: +
+ +
+Image classification

- - +

-You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/docs/transformers/task_summary). +```py +from transformers import pipeline + +pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer") +pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") +[{'label': 'macaw', 'score': 0.997848391532898}, + {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita', + 'score': 0.0016551691805943847}, + {'label': 'lorikeet', 'score': 0.00018523589824326336}, + {'label': 'African grey, African gray, Psittacus erithacus', + 'score': 7.85409429227002e-05}, + {'label': 'quail', 'score': 5.502637941390276e-05}] +``` -In addition to `pipeline`, to download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version: -```python ->>> from transformers import AutoTokenizer, AutoModel +
->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") ->>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased") +
+Visual question answering ->>> inputs = tokenizer("Hello world!", return_tensors="pt") ->>> outputs = model(**inputs) -``` -And here is the equivalent code for TensorFlow: -```python ->>> from transformers import AutoTokenizer, TFAutoModel +

+ +

->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") ->>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased") +```py +from transformers import pipeline ->>> inputs = tokenizer("Hello world!", return_tensors="tf") ->>> outputs = model(**inputs) +pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base") +pipeline( + image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg", + question="What is in the image?", +) +[{'answer': 'statue of liberty'}] ``` -The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator. - -The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset. +
-## Why should I use transformers? +## Why should I use Transformers? 1. Easy-to-use state-of-the-art models: - - High performance on natural language understanding & generation, computer vision, and audio tasks. - - Low barrier to entry for educators and practitioners. + - High performance on natural language understanding & generation, computer vision, audio, video, and multimodal tasks. + - Low barrier to entry for researchers, engineers, and developers. - Few user-facing abstractions with just three classes to learn. - A unified API for using all our pretrained models. 1. Lower compute costs, smaller carbon footprint: - - Researchers can share trained models instead of always retraining. - - Practitioners can reduce compute time and production costs. - - Dozens of architectures with over 400,000 pretrained models across all modalities. + - Share trained models instead of training from scratch. + - Reduce compute time and production costs. + - Dozens of model architectures with 1M+ pretrained checkpoints across all modalities. -1. Choose the right framework for every part of a model's lifetime: +1. Choose the right framework for every part of a models lifetime: - Train state-of-the-art models in 3 lines of code. - - Move a single model between TF2.0/PyTorch/JAX frameworks at will. - - Seamlessly pick the right framework for training, evaluation, and production. + - Move a single model between PyTorch/JAX/TF2.0 frameworks at will. + - Pick the right framework for training, evaluation, and production. 1. Easily customize a model or an example to your needs: - We provide examples for each architecture to reproduce the results published by its original authors. - Model internals are exposed as consistently as possible. - Model files can be used independently of the library for quick experiments. -## Why shouldn't I use transformers? - -- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files. -- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)). -- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. - -## Installation - -### With pip - -This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+. - -You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). - -First, create a virtual environment with the version of Python you're going to use and activate it. - -**macOS/Linux** - -```python -m venv env -source env/bin/activate -``` - -**Windows** - -``` python -m venv env -env\Scripts\activate -``` + + Hugging Face Enterprise Hub +
-To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands: +## Why shouldn't I use Transformers? -[TensorFlow installation page](https://www.tensorflow.org/install/), -[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) +- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files. +- The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate). +- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work. -When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows: +## 100 projects using Transformers -``` -pip install transformers -``` +Transformers is more than a toolkit to use pretrained models, it's a community of projects built around it and the +Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone +else to build their dream projects. -If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source). +In order to celebrate Transformers 100,000 stars, we wanted to put the spotlight on the +community with the [awesome-transformers](./awesome-transformers.md) page which lists 100 +incredible projects built with Transformers. -``` -git clone https://github.com/huggingface/transformers.git -cd transformers -pip install . -``` +If you own or use a project that you believe should be part of the list, please open a PR to add it! -### With conda +## Example models -🤗 Transformers can be installed using conda as follows: +You can test most of our models directly on their [Hub model pages](https://huggingface.co/models). -```shell script -conda install conda-forge::transformers -``` +Expand each modality below to see a few example models for various use cases. -> **_NOTE:_** Installing `transformers` from the `huggingface` channel is deprecated. +
+Audio -Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda. +- Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) +- Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine) +- Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) +- Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large) +- Text to speech with [Bark](https://huggingface.co/suno/bark) -> **_NOTE:_** On Windows, you may be prompted to activate Developer Mode in order to benefit from caching. If this is not an option for you, please let us know in [this issue](https://github.com/huggingface/huggingface_hub/issues/1062). +
-## Model architectures +
+Computer vision -**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations). +- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base) +- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf) +- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base) +- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) +- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue) +- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) +- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) +- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) +- Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) -Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen) +
-🤗 Transformers currently provides the following architectures: see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them. +
+Multimodal

- Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B)
- Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base)
- Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
- Image captioning with [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b)
- OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
- Table question answering with [TAPAS](https://huggingface.co/google/tapas-base)
- Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen)
- Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)
- Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
- Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224)
+
+NLP -## Learn more +- Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) +- Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b) +- Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn) +- Translation with [T5](https://huggingface.co/google-t5/t5-base) +- Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) +- Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) -| Section | Description | -|-|-| -| [Documentation](https://huggingface.co/docs/transformers/) | Full API documentation and tutorials | -| [Task summary](https://huggingface.co/docs/transformers/task_summary) | Tasks supported by 🤗 Transformers | -| [Preprocessing tutorial](https://huggingface.co/docs/transformers/preprocessing) | Using the `Tokenizer` class to prepare data for the models | -| [Training and fine-tuning](https://huggingface.co/docs/transformers/training) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API | -| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/main/examples) | Example scripts for fine-tuning models on a wide range of tasks | -| [Model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing) | Upload and share your fine-tuned models with the community | +
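Any of the checkpoints listed above drops into the same `Pipeline` API shown in the Quickstart. As a minimal sketch, masked word completion with the ModernBERT checkpoint from the NLP list could look like the following (the task string and the example sentence are illustrative choices, not taken from this README):

```py
from transformers import pipeline

# Masked word completion, reusing the ModernBERT checkpoint listed above.
fill_mask = pipeline(task="fill-mask", model="answerdotai/ModernBERT-base")

# The pipeline returns one dict per candidate token, sorted by score.
predictions = fill_mask("Plants create energy through a process known as [MASK].")
print([p["token_str"] for p in predictions])
```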
## Citation diff --git a/awesome-transformers.md b/awesome-transformers.md index d706498a08eb..29f50184ec3d 100644 --- a/awesome-transformers.md +++ b/awesome-transformers.md @@ -29,7 +29,7 @@ Keywords: inpainting, SD, Stable Diffusion ## [flair](https://github.com/flairNLP/flair) -FLAIR is a powerful PyTorch NLP framework, convering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things. +FLAIR is a powerful PyTorch NLP framework, covering several important tasks: NER, sentiment-analysis, part-of-speech tagging, text and document embeddings, among other things. Keywords: NLP, text embedding, document embedding, biomedical, NER, PoS, sentiment-analysis @@ -47,7 +47,7 @@ Keywords: LLMs, Large Language Models, Agents, Chains ## [LlamaIndex](https://github.com/run-llama/llama_index) -[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retreival mechanisms to perform different LLM tasks and obtain knowledge-augmented results. +[LlamaIndex](https://github.com/run-llama/llama_index) is a project that provides a central interface to connect your LLM's with external data. It provides various kinds of indices and retrieval mechanisms to perform different LLM tasks and obtain knowledge-augmented results. Keywords: LLMs, Large Language Models, Data Retrieval, Indices, Knowledge Augmentation diff --git a/benchmark/README.md b/benchmark/README.md index a827da444f08..3935f02b389d 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -12,7 +12,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, ## Writing metrics to the database -`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. +`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. cf [`llama.py`](./llama.py) to see an example of this in practice. 
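To make the thread-safety note above concrete, a minimal sketch of that pattern is shown below; the `collect_device_measurements(benchmark_id)` call is an assumed interface used only for illustration, not the exact signature from `llama.py`:

```py
import threading
import time

def run_with_device_sampling(metrics_recorder, benchmark_id, run_model_measurements):
    """Sample device metrics in a background thread while the main thread runs the model."""
    stop = threading.Event()

    def sample_device():
        while not stop.is_set():
            # Hypothetical recorder call; the real method name/arguments may differ.
            metrics_recorder.collect_device_measurements(benchmark_id)
            time.sleep(0.1)

    sampler = threading.Thread(target=sample_device, daemon=True)
    sampler.start()                  # device readings start in the background
    try:
        run_model_measurements()     # main thread is free to time the model
    finally:
        stop.set()
        sampler.join()
```

Because the recorder is thread-safe, the sampling thread and the main measurement code can write to it concurrently without additional locking.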
diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py index 7925e2902834..6c036fdd6939 100644 --- a/benchmark/benchmarks_entrypoint.py +++ b/benchmark/benchmarks_entrypoint.py @@ -3,7 +3,6 @@ import logging import os from typing import Dict -import psycopg2 import sys from psycopg2.extras import Json @@ -136,7 +135,7 @@ def import_from_path(module_name, file_path): continue logger.debug(f"loading: {entry.name}") module = import_from_path(entry.name.split(".")[0], entry.path) - logger.info(f"runnning benchmarks in: {entry.name}") + logger.info(f"running benchmarks in: {entry.name}") module.run_benchmark(logger, branch, commit_id, commit_msg) except ImportModuleException as e: logger.error(e) diff --git a/benchmark/llama.py b/benchmark/llama.py index bbe1afefd5ef..1857dee3d66b 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -118,7 +118,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): with torch.no_grad(): past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + num_tokens_to_generate, @@ -144,7 +144,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + num_tokens_to_generate, @@ -187,7 +187,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + num_tokens_to_generate + 10, @@ -204,7 +204,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): time_to_first_token = end - start logger.info(f"completed first compile generation in: {time_to_first_token}s") cache_position += 1 - all_generated_tokens += next_token.clone().detach().cpu().tolist() + all_generated_tokens += next_token.tolist() cache_position = torch.tensor([seq_length], device=device) ### First compile, decoding @@ -215,9 +215,9 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): torch.cuda.synchronize() end = perf_counter() time_to_second_token = end - start - logger.info(f"completed second compile generation in: {time_to_first_token}s") + logger.info(f"completed second compile generation in: {time_to_second_token}s") cache_position += 1 - all_generated_tokens += next_token.clone().detach().cpu().tolist() + all_generated_tokens += next_token.tolist() ### Second compile, decoding start = perf_counter() @@ -227,15 +227,15 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): torch.cuda.synchronize() end = perf_counter() time_to_third_token = end - start - logger.info(f"completed third compile forward in: {time_to_first_token}s") + logger.info(f"completed third compile forward in: {time_to_third_token}s") cache_position += 1 - all_generated_tokens += next_token.clone().detach().cpu().tolist() + all_generated_tokens += next_token.tolist() ### Using cuda graphs decoding start = perf_counter() for _ in range(1, num_tokens_to_generate): - all_generated_tokens += next_token.clone().detach().cpu().tolist() + all_generated_tokens += next_token.tolist() next_token = decode_one_token( model, next_token.clone(), cache_position=cache_position, 
past_key_values=past_key_values ) @@ -254,7 +254,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + 128, @@ -271,7 +271,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + 128, @@ -287,7 +287,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + 128, @@ -298,12 +298,12 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): output = model.generate(**inputs, past_key_values=past_key_values) end = perf_counter() third_compile_generate_time = end - start - logger.info(f"completed second compile generation in: {third_compile_generate_time}s") + logger.info(f"completed third compile generation in: {third_compile_generate_time}s") logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") past_key_values = StaticCache( model.config, - batch_size=batch_size, + max_batch_size=batch_size, device=device, dtype=torch.float16, max_cache_len=seq_length + 128, @@ -313,7 +313,7 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): output = model.generate(**inputs, past_key_values=past_key_values) end = perf_counter() fourth_compile_generate_time = end - start - logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s") + logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s") logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") metrics_recorder.collect_model_measurements( diff --git a/conftest.py b/conftest.py index 40e43f25e893..ee012215e070 100644 --- a/conftest.py +++ b/conftest.py @@ -46,10 +46,6 @@ "test_keep_in_fp32_modules", "test_gradient_checkpointing_backward_compatibility", "test_gradient_checkpointing_enable_disable", - "test_save_load_fast_init_from_base", - "test_fast_init_context_manager", - "test_fast_init_tied_embeddings", - "test_save_load_fast_init_to_base", "test_torch_save_load", "test_initialization", "test_forward_signature", @@ -61,7 +57,6 @@ "test_load_save_without_tied_weights", "test_tied_weights_keys", "test_model_weights_reload_no_missing_tied_weights", - "test_pt_tf_model_equivalence", "test_mismatched_shapes_have_properly_initialized_weights", "test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist", "test_model_is_small", @@ -85,12 +80,6 @@ def pytest_configure(config): - config.addinivalue_line( - "markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested" - ) - config.addinivalue_line( - "markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested" - ) config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested") config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment") config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate") diff --git a/docker/README.md b/docker/README.md index 2a71ab6fb6ec..5410a2839e37 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,8 +2,8 
@@ In this folder you will find various docker files, and some subfolders. - dockerfiles (ex: `consistency.dockerfile`) present under `~/docker` are used for our "fast" CIs. You should be able to use them for tasks that only need CPU. For example `torch-light` is a very light weights container (703MiB). -- subfloder contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs) +- subfolders contain dockerfiles used for our `slow` CIs, which *can* be used for GPU tasks, but they are **BIG** as they were not specifically designed for a single model / single task. Thus the `~/docker/transformers-pytorch-gpu` includes additional dependencies to allow us to run ALL model tests (say `librosa` or `tesseract`, which you do not need to run LLMs) Note that in both case, you need to run `uv pip install -e .`, which should take around 5 seconds. We do it outside the dockerfile for the need of our CI: we checkout a new branch each time, and the `transformers` code is thus updated. -We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: \ No newline at end of file +We are open to contribution, and invite the community to create dockerfiles with potential arguments that properly choose extras depending on the model's dependencies! :hugs: diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile index a564d76c9bb0..5b35a5f85dcd 100644 --- a/docker/consistency.dockerfile +++ b/docker/consistency.dockerfile @@ -5,12 +5,12 @@ ARG REF=main RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs ENV UV_PYTHON=/usr/local/bin/python RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython -RUN pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu # tensorflow pin matching setup.py RUN uv pip install --no-cache-dir pypi-kenlm RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16" RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,testing,torch-speech,vision]" RUN git lfs install -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/custom-tokenizers.dockerfile b/docker/custom-tokenizers.dockerfile index 87601bcf3f41..a0a9f5ea23b7 100644 --- a/docker/custom-tokenizers.dockerfile +++ b/docker/custom-tokenizers.dockerfile @@ -1,5 +1,6 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler ENV UV_PYTHON=/usr/local/bin/python @@ -16,11 +17,11 @@ RUN make install -j 10 RUN uv pip install --no-cache --upgrade 'torch' --index-url 
https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir "transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite +RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite # spacy is not used so not tested. Causes to failures. TODO fix later RUN python3 -m unidic download -RUN pip uninstall -y transformers +RUN uv pip uninstall transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* -RUN apt remove -y g++ cmake xz-utils libprotobuf-dev protobuf-compiler \ No newline at end of file +RUN apt remove -y g++ cmake xz-utils libprotobuf-dev protobuf-compiler diff --git a/docker/examples-tf.dockerfile b/docker/examples-tf.dockerfile index ffbaafd8b86b..306d00fdea57 100644 --- a/docker/examples-tf.dockerfile +++ b/docker/examples-tf.dockerfile @@ -1,12 +1,13 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git RUN apt-get install -y g++ cmake ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval -RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" -RUN uv pip install --no-cache-dir "protobuf==3.20.3" -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file +RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/examples-torch.dockerfile b/docker/examples-torch.dockerfile index f9c7257b9cca..e1029f6ddf1f 100644 --- a/docker/examples-torch.dockerfile +++ b/docker/examples-torch.dockerfile @@ -1,11 +1,12 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval 
albumentations jiwer +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/exotic-models.dockerfile b/docker/exotic-models.dockerfile index dd40476064a3..32491888e800 100644 --- a/docker/exotic-models.dockerfile +++ b/docker/exotic-models.dockerfile @@ -5,13 +5,13 @@ USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir --no-deps timm accelerate RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels -RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset' +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[testing, vision]" 'scikit-learn' 'torch-stft' 'nose' 'dataset' # RUN git clone https://github.com/facebookresearch/detectron2.git # RUN python3 -m pip install --no-cache-dir -e detectron2 -RUN pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' -RUN pip uninstall -y transformers +RUN uv pip install 'git+https://github.com/facebookresearch/detectron2.git@92ae9f0b92aba5867824b4f12aa06a22a60a45d3' --no-build-isolation +RUN uv pip uninstall transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/jax-light.dockerfile b/docker/jax-light.dockerfile index df1e1144c0e1..c2a73e98ca98 100644 --- a/docker/jax-light.dockerfile +++ b/docker/jax-light.dockerfile @@ -5,6 +5,6 @@ USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file +RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/pipeline-tf.dockerfile b/docker/pipeline-tf.dockerfile index dd38b52acbb3..61a442a55945 100644 --- a/docker/pipeline-tf.dockerfile +++ b/docker/pipeline-tf.dockerfile @@ -5,6 +5,6 @@ USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++ ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir 
"git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability -RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file +RUN apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile index 2b4014b4fff1..10b6450b2dfc 100644 --- a/docker/pipeline-torch.dockerfile +++ b/docker/pipeline-torch.dockerfile @@ -5,7 +5,7 @@ USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" -RUN pip uninstall -y transformers \ No newline at end of file +RUN uv pip uninstall transformers diff --git a/docker/quality.dockerfile b/docker/quality.dockerfile index 7e6999e40f20..e2421efe00b4 100644 --- a/docker/quality.dockerfile +++ b/docker/quality.dockerfile @@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y time git ENV UV_PYTHON=/usr/local/bin/python RUN pip install uv && uv venv RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3 -RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file +RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/* diff --git a/docker/tf-light.dockerfile b/docker/tf-light.dockerfile index 67dc928c22fa..0206c7406390 100644 --- a/docker/tf-light.dockerfile +++ b/docker/tf-light.dockerfile @@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de RUN apt-get install -y cmake ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" -RUN uv pip install --no-cache-dir "protobuf==3.20.3" -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file +RUN uv pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN uv pip install --no-cache-dir "protobuf==3.20.3" +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/torch-jax-light.dockerfile 
b/docker/torch-jax-light.dockerfile index b779fcbfac9a..6394bc76afc2 100644 --- a/docker/torch-jax-light.dockerfile +++ b/docker/torch-jax-light.dockerfile @@ -6,11 +6,11 @@ RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN uv pip install --no-deps accelerate -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu -RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" # RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" -RUN pip uninstall -y transformers +RUN uv pip uninstall transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index c1a8f8ac0f51..a13d855a53e7 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -5,7 +5,7 @@ USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken]" -RUN pip uninstall -y transformers \ No newline at end of file +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]" +RUN uv pip uninstall transformers diff --git a/docker/torch-tf-light.dockerfile b/docker/torch-tf-light.dockerfile index 0556b79ffdb3..63512328f129 100644 --- a/docker/torch-tf-light.dockerfile +++ b/docker/torch-tf-light.dockerfile @@ -7,13 +7,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN git lfs install RUN uv pip install --no-cache-dir pypi-kenlm -RUN pip install --no-cache-dir 
"git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,sentencepiece,vision,testing]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" librosa -RUN pip uninstall -y transformers -RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file +RUN uv pip uninstall transformers +RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index ebc7ae0d2115..1eb50ee4ad7f 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -57,7 +57,8 @@ RUN python3 -m pip uninstall -y ninja # For `dinat` model # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) -RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels +# pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'` +RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels # For `nougat` tokenizer RUN python3 -m pip install --no-cache-dir python-Levenshtein diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 4313c2242199..a71043dc8215 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -12,7 +12,7 @@ RUN git lfs install RUN python3 -m pip install --no-cache-dir --upgrade pip numpy -RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 +RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4 RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile index a8edb8ff03eb..f70b15494100 100644 --- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile @@ -2,10 +2,10 @@ FROM rocm/dev-ubuntu-22.04:6.2.4 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.5.1' -ARG TORCH_VISION='0.20.0' -ARG TORCH_AUDIO='2.5.0' -ARG ROCM='6.2' +ARG PYTORCH='2.6.0' +ARG TORCH_VISION='0.21.0' +ARG TORCH_AUDIO='2.6.0' +ARG ROCM='6.2.4' RUN apt update && \ apt install -y --no-install-recommends \ @@ -16,9 +16,11 @@ RUN apt update && \ python-is-python3 \ rocrand-dev \ rocthrust-dev \ + rocblas-dev \ + hipsolver-dev \ hipsparse-dev \ hipblas-dev \ - rocblas-dev && \ + hipblaslt-dev && \ apt clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 700df877d10f..33d8b10b02ee 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 +FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG 
DEBIAN_FRONTEND=noninteractive @@ -9,9 +9,9 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.5.1' +ARG PYTORCH='2.6.0' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu118' +ARG CUDA='cu121' RUN apt update RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg @@ -26,8 +26,6 @@ RUN echo torch=$VERSION # Currently, let's just use their latest releases (when `torch` is installed with a release version) RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] - RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate # needed in bnb and awq @@ -36,10 +34,9 @@ RUN python3 -m pip install --no-cache-dir einops # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes -# Add auto-gptq for gtpq quantization testing, installed from source for pytorch==2.5.1 compatibility -# TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI. -RUN pip install gekko -RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install +# Add gptqmodel for gptq quantization testing, installed from source for pytorch==2.6.0 compatibility +RUN python3 -m pip install lm_eval +RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation # Add optimum for gptq quantization testing RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum @@ -51,10 +48,11 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 # Add vptq for quantization testing -RUN python3 -m pip install --no-cache-dir vptq +RUN pip install vptq # Add spqr for quantization testing -RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] +# Commented out for now: no matching distribution is found, and we need to reach out to the authors +# RUN python3 -m pip install --no-cache-dir spqr_quant[gpu] # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq @@ -63,18 +61,29 @@ RUN python3 -m pip install --no-cache-dir hqq RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing -# >=v0.2.7 needed for compatibility with transformers > 4.46 -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl +# New release v0.2.8 +RUN python3 -m pip install --no-cache-dir autoawq[kernels] # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto # Add eetq for quantization testing -RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git +RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .
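For context, the gptqmodel and optimum dependencies installed above are exercised through the standard Transformers quantization API. A minimal sketch of that call path, assuming a small placeholder causal LM checkpoint and calibration dataset (neither is part of this Dockerfile):

```py
# Hedged sketch of the GPTQ test path enabled by the gptqmodel/optimum installs above.
# The checkpoint name and calibration dataset are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder small model
tokenizer = AutoTokenizer.from_pretrained(model_id)
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# Quantizes the weights to 4-bit GPTQ on load using the installed backend.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)
```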
+ +# # Add flute-kernel and fast_hadamard_transform for quantization testing +# # Commented for now as they cause issues with the build +# # TODO: create a new workflow to test them +# RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1 +# RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git -# Add flute-kernel and fast_hadamard_transform for quantization testing -RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118 -RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1 +# Add compressed-tensors for quantization testing +RUN python3 -m pip install --no-cache-dir compressed-tensors + +# Add AMD Quark for quantization testing +RUN python3 -m pip install --no-cache-dir amd-quark + +# Add transformers in editable mode +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md index 1213b3500860..c7efd8f02f48 100644 --- a/docs/source/ar/agents.md +++ b/docs/source/ar/agents.md @@ -195,7 +195,7 @@ You have access to the following tools: To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. -Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. +Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. During each intermediate step, you can use 'print()' to save whatever important information you will then need. These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. @@ -205,7 +205,7 @@ Here are a few examples using notional tools: --- {examples} -Above example were using notional tools that might not exist for you. You only have acces to those tools: +Above example were using notional tools that might not exist for you. You only have access to those tools: <> You also can perform computations in the python code you generate. diff --git a/docs/source/ar/bertology.md b/docs/source/ar/bertology.md index d3f95e20d7df..d12d7838906e 100644 --- a/docs/source/ar/bertology.md +++ b/docs/source/ar/bertology.md @@ -15,4 +15,4 @@ - الوصول إلى جميع أوزان الانتباه لكل رأس في BERT/GPT/GPT-2، - استرجاع قيم ومشتقات مخرجات الرأس لحساب درجة أهمية الرأس وحذفه كما هو موضح في https://arxiv.org/abs/1905.10650. -ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) أثناء استخراج المعلومات وتقليص من نموذج تم تدريبه مسبقًا على GLUE. \ No newline at end of file +ولمساعدتك على فهم واستخدام هذه الميزات بسهولة، أضفنا مثالًا برمجيًا محددًا: [bertology.py](https://github.com/huggingface/transformers-research-projects/tree/main/bertology/run_bertology.py) أثناء استخراج المعلومات وتقليص من نموذج تم تدريبه مسبقًا على GLUE. 
\ No newline at end of file diff --git a/docs/source/ar/run_scripts.md b/docs/source/ar/run_scripts.md index 593d4aec85fc..c7aea4eb9611 100644 --- a/docs/source/ar/run_scripts.md +++ b/docs/source/ar/run_scripts.md @@ -2,7 +2,7 @@ بالإضافة إلى دفاتر الملاحظات [notebooks](./notebooks) الخاصة بـ 🤗 Transformers، هناك أيضًا نصوص برمجية توضيحية تُظهر كيفية تدريب نموذج لمهمة باستخدام [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch) أو [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) أو [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). -كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers/tree/main/examples/research_projects) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة. +كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers-research-projects/) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة. لا يُتوقع أن تعمل النصوص البرمجية التوضيحية بشكل مباشر على كل مشكلة، وقد تحتاج إلى تكييف النص البرمجي مع المشكلة التي تحاول حلها. ولمساعدتك في ذلك، تعرض معظم النصوص البرمجية كيفية معالجة البيانات قبل التدريب بشكل كامل، مما يتيح لك تحريرها حسب الحاجة لحالتك الاستخدام. diff --git a/docs/source/ar/serialization.md b/docs/source/ar/serialization.md index 2df620d86239..6f437dea0681 100644 --- a/docs/source/ar/serialization.md +++ b/docs/source/ar/serialization.md @@ -116,11 +116,11 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s -لم يعد يتم دعم `tranformers.onnx` يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة. +لم يعد يتم دعم `transformers.onnx` يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة. -لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `tranformers.onnx`، ثبّت التبعيات الإضافية: +لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية: ```bash pip install transformers[onnx] diff --git a/docs/source/ar/trainer.md b/docs/source/ar/trainer.md index 7da7cbf4e171..e70dbb255eac 100644 --- a/docs/source/ar/trainer.md +++ b/docs/source/ar/trainer.md @@ -673,6 +673,29 @@ tpu_use_sudo: false use_cpu: false ``` + + + +```yml +compute_environment: LOCAL_MACHINE +tp_config: + tp_size: 4 +distributed_type: TP +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: 'no' +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false + +``` + يُعد أمر [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) هو الطريقة المُوصى بها لتشغيل نص البرمجى للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. 
يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`. diff --git a/docs/source/de/contributing.md b/docs/source/de/contributing.md index d014dd67c83a..61ee8c3fc4e3 100644 --- a/docs/source/de/contributing.md +++ b/docs/source/de/contributing.md @@ -283,8 +283,6 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t Wie bei den langsamen Tests gibt es auch andere Umgebungsvariablen, die standardmäßig beim Testen nicht gesetzt sind: * `RUN_CUSTOM_TOKENIZERS`: Aktiviert Tests für benutzerdefinierte Tokenizer. -* `RUN_PT_FLAX_CROSS_TESTS`: Aktiviert Tests für die Integration von PyTorch + Flax. -* `RUN_PT_TF_CROSS_TESTS`: Aktiviert Tests für die Integration von TensorFlow + PyTorch. Weitere Umgebungsvariablen und zusätzliche Informationen finden Sie in der [testing_utils.py](src/transformers/testing_utils.py). diff --git a/docs/source/de/index.md b/docs/source/de/index.md index 5ddabb4e7382..8aaaa5952c07 100644 --- a/docs/source/de/index.md +++ b/docs/source/de/index.md @@ -88,7 +88,7 @@ Die Bibliothek enthält derzeit JAX-, PyTorch- und TensorFlow-Implementierungen, 1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. 1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT. +1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers-research-projects/tree/main/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers-research-projects/tree/main/distillation) and a German version of DistilBERT. 1. 
**[DiT](model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[DPT](master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md index c01609207fec..856ba546b977 100644 --- a/docs/source/de/quicktour.md +++ b/docs/source/de/quicktour.md @@ -156,7 +156,7 @@ Die [`pipeline`] kann jedes Modell aus dem [Model Hub](https://huggingface.co/mo -Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` below): +Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` below): ```py >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification @@ -166,7 +166,7 @@ Use the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the ``` -Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` below): +Use the [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `TFAutoClass` below): ```py >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification @@ -222,7 +222,7 @@ Anschließend wandelt der Tokenizer die Token in Zahlen um, um einen Tensor als Der Tokenizer gibt ein Wörterbuch zurück, das Folgendes enthält: * [input_ids](./glossary#input-ids): numerische Repräsentationen Ihrer Token. -* [atttention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. +* [attention_mask](.glossary#attention-mask): gibt an, welche Token beachtet werden sollen. Genau wie die [`pipeline`] akzeptiert der Tokenizer eine Liste von Eingaben. Darüber hinaus kann der Tokenizer den Text auch auffüllen und kürzen, um einen Stapel mit einheitlicher Länge zurückzugeben: diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md index 17b725827dd7..4b62c73276e0 100644 --- a/docs/source/de/run_scripts.md +++ b/docs/source/de/run_scripts.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. Neben den 🤗 Transformers [notebooks](./notebooks) gibt es auch Beispielskripte, die zeigen, wie man ein Modell für eine Aufgabe mit [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) oder [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax) trainiert. 
-Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers/tree/main/examples/research_projects) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. +Sie werden auch Skripte finden, die wir in unseren [Forschungsprojekten](https://github.com/huggingface/transformers-research-projects/) und [Legacy-Beispielen](https://github.com/huggingface/transformers/tree/main/examples/legacy) verwendet haben und die größtenteils von der Community stammen. Diese Skripte werden nicht aktiv gepflegt und erfordern eine bestimmte Version von 🤗 Transformers, die höchstwahrscheinlich nicht mit der neuesten Version der Bibliothek kompatibel ist. Es wird nicht erwartet, dass die Beispielskripte bei jedem Problem sofort funktionieren. Möglicherweise müssen Sie das Skript an das Problem anpassen, das Sie zu lösen versuchen. Um Ihnen dabei zu helfen, legen die meisten Skripte vollständig offen, wie die Daten vorverarbeitet werden, so dass Sie sie nach Bedarf für Ihren Anwendungsfall bearbeiten können. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dc259103ae2e..6c4b7498b3da 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1,291 +1,310 @@ - sections: - local: index - title: 🤗 Transformers - - local: quicktour - title: Quick tour + title: Transformers - local: installation title: Installation - - local: add_new_model - title: Adding a new model to `transformers` + - local: quicktour + title: Quickstart title: Get started -- sections: - - local: pipeline_tutorial - title: Run inference with pipelines - - local: autoclass_tutorial - title: Write portable code with AutoClass - - local: preprocessing - title: Preprocess data - - local: training - title: Fine-tune a pretrained model - - local: run_scripts - title: Train with a script - - local: accelerate - title: Set up distributed training with 🤗 Accelerate - - local: peft - title: Load and train adapters with 🤗 PEFT - - local: model_sharing - title: Share your model - - local: agents - title: Agents 101 - - local: agents_advanced - title: Agents, supercharged - Multi-agents, External tools, and more - - local: llm_tutorial - title: Generation with LLMs - - local: conversations - title: Chatting with Transformers - title: Tutorials -- sections: - - isExpanded: false - sections: - - local: tasks/sequence_classification - title: Text classification - - local: tasks/token_classification - title: Token classification - - local: tasks/question_answering - title: Question answering - - local: tasks/language_modeling - title: Causal language modeling - - local: tasks/masked_language_modeling - title: Masked language modeling - - local: tasks/translation - title: Translation - - local: tasks/summarization - title: Summarization - - local: tasks/multiple_choice - title: Multiple choice - title: Natural Language Processing - - isExpanded: false - sections: - - local: tasks/audio_classification - title: Audio classification - - local: tasks/asr - title: Automatic speech recognition - title: Audio - - isExpanded: false - sections: - - local: tasks/image_classification - title: Image classification - - local: 
tasks/semantic_segmentation - title: Image segmentation - - local: tasks/video_classification - title: Video classification - - local: tasks/object_detection - title: Object detection - - local: tasks/zero_shot_object_detection - title: Zero-shot object detection - - local: tasks/zero_shot_image_classification - title: Zero-shot image classification - - local: tasks/monocular_depth_estimation - title: Depth estimation - - local: tasks/image_to_image - title: Image-to-Image - - local: tasks/image_feature_extraction - title: Image Feature Extraction - - local: tasks/mask_generation - title: Mask Generation - - local: tasks/keypoint_detection - title: Keypoint Detection - - local: tasks/knowledge_distillation_for_image_classification - title: Knowledge Distillation for Computer Vision - title: Computer Vision - - isExpanded: false - sections: - - local: tasks/image_captioning - title: Image captioning - - local: tasks/document_question_answering - title: Document Question Answering - - local: tasks/visual_question_answering - title: Visual Question Answering - - local: tasks/text-to-speech - title: Text to speech - - local: tasks/image_text_to_text - title: Image-text-to-text - - local: tasks/video_text_to_text - title: Video-text-to-text - title: Multimodal - - isExpanded: false - sections: +- isExpanded: false + sections: + - sections: + - local: models + title: Loading models + - local: custom_models + title: Customizing models + - local: how_to_hack_models + title: Customizing model components + - local: model_sharing + title: Sharing + - local: add_new_model + title: Adding a new model to Transformers + - local: modular_transformers + title: Modular Transformers + - local: task_summary + title: What 🤗 Transformers can do + - local: tasks_explained + title: How 🤗 Transformers solve tasks + - local: model_summary + title: The Transformer model family + - local: attention + title: Attention mechanisms + - local: attention_interface + title: Customizing attention function + title: Models + - sections: + - local: fast_tokenizers + title: Tokenizers + - local: image_processors + title: Image processors + - local: backbones + title: Backbones + - local: feature_extractors + title: Feature extractors + - local: processors + title: Processors + - local: tokenizer_summary + title: Summary of the tokenizers + - local: pad_truncation + title: Padding and truncation + title: Preprocessors + title: Base classes +- isExpanded: false + sections: + - sections: + - local: pipeline_tutorial + title: Pipeline + - local: pipeline_gradio + title: Machine learning apps + - local: pipeline_webserver + title: Web server inference + - local: add_new_pipeline + title: Adding a new pipeline + title: Pipeline API + - sections: + - local: llm_tutorial + title: Text generation - local: generation_strategies - title: Customize the generation strategy - - local: kv_cache - title: Best Practices for Generation with Cache - title: Generation - - isExpanded: false - sections: - - local: chat_template_basics - title: Getting Started with Chat Templates for Text LLMs - - local: chat_template_multimodal - title: Multimodal Chat Templates for Vision and Audio LLMs - - local: chat_template_tools_and_documents - title: Expanding Chat Templates with Tools and Documents - - local: chat_template_advanced - title: Advanced Usage and Customizing Your Chat Templates - title: Chat Templates - - isExpanded: false - sections: - - local: tasks/idefics - title: Image tasks with IDEFICS + title: Generation strategies + - local: 
generation_features + title: Generation features - local: tasks/prompting - title: LLM prompting guide - title: Prompting - title: Task Guides -- sections: - - local: fast_tokenizers - title: Use fast tokenizers from 🤗 Tokenizers - - local: multilingual - title: Run inference with multilingual models - - local: create_a_model - title: Use model-specific APIs - - local: custom_models - title: Share a custom model - - local: trainer - title: Trainer - - local: sagemaker - title: Run training on Amazon SageMaker - - local: serialization - title: Export to ONNX - - local: tflite - title: Export to TFLite - - local: torchscript - title: Export to TorchScript - - local: notebooks - title: Notebooks with examples - - local: community - title: Community resources - - local: troubleshooting - title: Troubleshoot - - local: gguf - title: Interoperability with GGUF files - - local: tiktoken - title: Interoperability with TikToken files - - local: modular_transformers - title: Modularity in `transformers` - - local: how_to_hack_models - title: Model Hacking (overwriting a class to your usage) - title: Developer guides -- sections: + title: Prompt engineering + - local: llm_optims + title: Optimizing inference + - local: kv_cache + title: KV cache strategies + - local: serving + title: Serving + - local: cache_explanation + title: Caching + - local: llm_tutorial_optimization + title: Getting the most out of LLMs + - local: perplexity + title: Perplexity of fixed-length models + title: LLMs + - sections: + - local: conversations + title: Chat basics + - local: chat_templating + title: Templates + - local: chat_templating_multimodal + title: Multimodal templates + - local: chat_templating_writing + title: Template writing + - local: chat_extras + title: Tools and RAG + title: Chat with models + - sections: + - local: perf_torch_compile + title: torch.compile + - local: perf_infer_gpu_one + title: GPU + - local: perf_infer_gpu_multi + title: Distributed GPU inference + - local: perf_infer_cpu + title: CPU + - local: tf_xla + title: XLA + title: Optimization + - local: agents + title: Agents + - local: tools + title: Tools + title: Inference +- isExpanded: false + sections: + - sections: + - local: trainer + title: Trainer + - local: training + title: Fine-tuning + - local: optimizers + title: Optimizers + - local: hpo_train + title: Hyperparameter search + title: Trainer API + - sections: + - local: gpu_selection + title: GPU selection + - local: accelerate + title: Accelerate + - local: fsdp + title: FullyShardedDataParallel + - local: deepspeed + title: DeepSpeed + - local: debugging + title: Multi-GPU debugging + - local: perf_train_cpu_many + title: Distributed CPUs + - local: perf_train_gpu_many + title: Parallelism methods + title: Distributed training + - sections: + - local: perf_train_gpu_one + title: GPU + - local: perf_train_cpu + title: CPU + - local: perf_train_tpu_tf + title: TPU + - local: perf_train_special + title: Apple Silicon + - local: perf_hardware + title: Build your own machine + title: Hardware + - local: peft + title: PEFT + - local: model_memory_anatomy + title: Model training anatomy + title: Training +- isExpanded: false + sections: - local: quantization/overview - title: Getting started - - local: quantization/bitsandbytes - title: bitsandbytes - - local: quantization/gptq - title: GPTQ - - local: quantization/awq - title: AWQ + title: Overview - local: quantization/aqlm title: AQLM - - local: quantization/vptq - title: SpQR - - local: quantization/spqr - title: VPTQ - - 
local: quantization/quanto - title: Quanto + - local: quantization/awq + title: AWQ + - local: quantization/bitnet + title: BitNet + - local: quantization/bitsandbytes + title: bitsandbytes + - local: quantization/compressed_tensors + title: compressed-tensors - local: quantization/eetq title: EETQ + - local: quantization/fbgemm_fp8 + title: FBGEMM + - local: quantization/finegrained_fp8 + title: Fine-grained FP8 + - local: gguf + title: GGUF + - local: quantization/gptq + title: GPTQ - local: quantization/higgs title: HIGGS - local: quantization/hqq title: HQQ - - local: quantization/fbgemm_fp8 - title: FBGEMM_FP8 - local: quantization/optimum title: Optimum + - local: quantization/quanto + title: Quanto + - local: quantization/quark + title: Quark - local: quantization/torchao - title: TorchAO - - local: quantization/bitnet - title: BitNet - - local: quantization/compressed_tensors - title: compressed-tensors - - local: quantization/finegrained_fp8 - title: Fine-grained FP8 + title: torchao + - local: quantization/spqr + title: SpQR + - local: quantization/vptq + title: VPTQ - local: quantization/contribute - title: Contribute new quantization method - title: Quantization Methods -- sections: - - local: performance - title: Overview - - local: llm_optims - title: LLM inference optimization - - sections: - - local: perf_train_gpu_one - title: Methods and tools for efficient training on a single GPU - - local: perf_train_gpu_many - title: Multiple GPUs and parallelism - - local: fsdp - title: Fully Sharded Data Parallel - - local: deepspeed - title: DeepSpeed - - local: perf_train_cpu - title: Efficient training on CPU - - local: perf_train_cpu_many - title: Distributed CPU training - - local: perf_train_tpu_tf - title: Training on TPU with TensorFlow - - local: perf_train_special - title: PyTorch training on Apple silicon - - local: perf_hardware - title: Custom hardware for training - - local: hpo_train - title: Hyperparameter Search using Trainer API - title: Efficient training techniques + title: Contribute + title: Quantization +- isExpanded: false + sections: + - local: serialization + title: ONNX + - local: tflite + title: LiteRT + - local: executorch + title: ExecuTorch + - local: torchscript + title: TorchScript + title: Export to production +- isExpanded: false + sections: - sections: - - local: perf_infer_cpu - title: CPU inference - - local: perf_infer_gpu_one - title: GPU inference - - local: perf_infer_gpu_multi - title: Multi-GPU inference - title: Optimizing inference - - local: big_models - title: Instantiate a big model - - local: debugging - title: Debugging - - local: tf_xla - title: XLA Integration for TensorFlow Models - - local: perf_torch_compile - title: Optimize inference using `torch.compile()` - title: Performance and scalability -- sections: + - sections: + - local: tasks/sequence_classification + title: Text classification + - local: tasks/token_classification + title: Token classification + - local: tasks/question_answering + title: Question answering + - local: tasks/language_modeling + title: Causal language modeling + - local: tasks/masked_language_modeling + title: Masked language modeling + - local: tasks/translation + title: Translation + - local: tasks/summarization + title: Summarization + - local: tasks/multiple_choice + title: Multiple choice + title: Natural language processing + - sections: + - local: tasks/audio_classification + title: Audio classification + - local: tasks/asr + title: Automatic speech recognition + title: Audio + - sections: + - 
local: tasks/image_classification + title: Image classification + - local: tasks/semantic_segmentation + title: Image segmentation + - local: tasks/video_classification + title: Video classification + - local: tasks/object_detection + title: Object detection + - local: tasks/zero_shot_object_detection + title: Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: Zero-shot image classification + - local: tasks/monocular_depth_estimation + title: Depth estimation + - local: tasks/image_to_image + title: Image-to-Image + - local: tasks/image_feature_extraction + title: Image Feature Extraction + - local: tasks/mask_generation + title: Mask Generation + - local: tasks/keypoint_detection + title: Keypoint detection + - local: tasks/knowledge_distillation_for_image_classification + title: Knowledge Distillation for Computer Vision + title: Computer vision + - sections: + - local: tasks/image_captioning + title: Image captioning + - local: tasks/document_question_answering + title: Document Question Answering + - local: tasks/visual_question_answering + title: Visual Question Answering + - local: tasks/text-to-speech + title: Text to speech + - local: tasks/idefics + title: Image tasks with IDEFICS + - local: tasks/image_text_to_text + title: Image-text-to-text + - local: tasks/video_text_to_text + title: Video-text-to-text + title: Multimodal + title: Task recipes + - local: run_scripts + title: Training scripts + - local: glossary + title: Glossary + - local: philosophy + title: Philosophy + - local: notebooks + title: Notebooks with examples + - local: community + title: Community resources + - local: troubleshooting + title: Troubleshoot + title: Resources +- isExpanded: false + sections: - local: contributing - title: How to contribute to 🤗 Transformers? - - local: add_new_model - title: How to add a model to 🤗 Transformers? - - local: add_new_pipeline - title: How to add a pipeline to 🤗 Transformers? 
+ title: Contribute to Transformers - local: testing - title: Testing + title: Transformers model tests - local: pr_checks - title: Checks on a Pull Request + title: Pull request checks title: Contribute -- sections: - - local: philosophy - title: Philosophy - - local: glossary - title: Glossary - - local: task_summary - title: What 🤗 Transformers can do - - local: tasks_explained - title: How 🤗 Transformers solve tasks - - local: model_summary - title: The Transformer model family - - local: tokenizer_summary - title: Summary of the tokenizers - - local: attention - title: Attention mechanisms - - local: pad_truncation - title: Padding and truncation - - local: bertology - title: BERTology - - local: perplexity - title: Perplexity of fixed-length models - - local: pipeline_webserver - title: Pipelines for webserver inference - - local: model_memory_anatomy - title: Model training anatomy - - local: llm_tutorial_optimization - title: Getting the most out of LLMs - title: Conceptual guides -- sections: +- isExpanded: false + sections: - sections: - local: main_classes/agent title: Agents and Tools @@ -313,6 +332,8 @@ title: Optimization - local: main_classes/output title: Model outputs + - local: main_classes/peft + title: PEFT - local: main_classes/pipelines title: Pipelines - local: main_classes/processors @@ -331,10 +352,9 @@ title: Feature Extractor - local: main_classes/image_processor title: Image Processor - title: Main Classes + title: Main classes - sections: - - isExpanded: false - sections: + - sections: - local: model_doc/albert title: ALBERT - local: model_doc/bamba @@ -395,6 +415,8 @@ title: DeBERTa - local: model_doc/deberta-v2 title: DeBERTa-v2 + - local: model_doc/deepseek_v3 + title: DeepSeek-V3 - local: model_doc/dialogpt title: DialoGPT - local: model_doc/diffllama @@ -461,6 +483,8 @@ title: Granite - local: model_doc/granitemoe title: GraniteMoe + - local: model_doc/granitemoeshared + title: GraniteMoeShared - local: model_doc/granitevision title: GraniteVision - local: model_doc/helium @@ -483,6 +507,8 @@ title: Llama2 - local: model_doc/llama3 title: Llama3 + - local: model_doc/llama4 + title: Llama4 - local: model_doc/longformer title: Longformer - local: model_doc/longt5 @@ -511,6 +537,8 @@ title: MegatronGPT2 - local: model_doc/mistral title: Mistral + - local: model_doc/mistral3 + title: Mistral3 - local: model_doc/mixtral title: Mixtral - local: model_doc/mluke @@ -561,6 +589,8 @@ title: Phi - local: model_doc/phi3 title: Phi-3 + - local: model_doc/phi4_multimodal + title: Phi4 Multimodal - local: model_doc/phimoe title: PhiMoE - local: model_doc/phobert @@ -575,6 +605,10 @@ title: Qwen2 - local: model_doc/qwen2_moe title: Qwen2MoE + - local: model_doc/qwen3 + title: Qwen3 + - local: model_doc/qwen3_moe + title: Qwen3MoE - local: model_doc/rag title: RAG - local: model_doc/realm @@ -642,8 +676,7 @@ - local: model_doc/zamba2 title: Zamba2 title: Text models - - isExpanded: false - sections: + - sections: - local: model_doc/beit title: BEiT - local: model_doc/bit @@ -714,6 +747,8 @@ title: NAT - local: model_doc/poolformer title: PoolFormer + - local: model_doc/prompt_depth_anything + title: Prompt Depth Anything - local: model_doc/pvt title: Pyramid Vision Transformer (PVT) - local: model_doc/pvt_v2 @@ -771,8 +806,7 @@ - local: model_doc/zoedepth title: ZoeDepth title: Vision models - - isExpanded: false - sections: + - sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer - local: model_doc/bark @@ -842,8 +876,7 @@ - 
local: model_doc/xlsr_wav2vec2 title: XLSR-Wav2Vec2 title: Audio models - - isExpanded: false - sections: + - sections: - local: model_doc/timesformer title: TimeSformer - local: model_doc/videomae @@ -851,14 +884,15 @@ - local: model_doc/vivit title: ViViT title: Video models - - isExpanded: false - sections: + - sections: - local: model_doc/align title: ALIGN - local: model_doc/altclip title: AltCLIP - local: model_doc/aria title: Aria + - local: model_doc/aya_vision + title: AyaVision - local: model_doc/blip title: BLIP - local: model_doc/blip-2 @@ -889,6 +923,8 @@ title: Emu3 - local: model_doc/flava title: FLAVA + - local: model_doc/gemma3 + title: Gemma3 - local: model_doc/git title: GIT - local: model_doc/got_ocr2 @@ -961,8 +997,14 @@ title: Qwen2VL - local: model_doc/sam title: Segment Anything + - local: model_doc/shieldgemma2 + title: ShieldGemma2 - local: model_doc/siglip title: SigLIP + - local: model_doc/siglip2 + title: SigLIP2 + - local: model_doc/smolvlm + title: SmolVLM - local: model_doc/speech-encoder-decoder title: Speech Encoder Decoder Models - local: model_doc/tapas @@ -990,15 +1032,13 @@ - local: model_doc/xclip title: X-CLIP title: Multimodal models - - isExpanded: false - sections: + - sections: - local: model_doc/decision_transformer title: Decision Transformer - local: model_doc/trajectory_transformer title: Trajectory Transformer title: Reinforcement learning models - - isExpanded: false - sections: + - sections: - local: model_doc/autoformer title: Autoformer - local: model_doc/informer @@ -1010,8 +1050,7 @@ - local: model_doc/time_series_transformer title: Time Series Transformer title: Time series models - - isExpanded: false - sections: + - sections: - local: model_doc/graphormer title: Graphormer title: Graph models @@ -1019,6 +1058,8 @@ - sections: - local: internal/modeling_utils title: Custom Layers and Utilities + - local: internal/model_debugging_utils + title: Utilities for Model Debugging - local: internal/pipelines_utils title: Utilities for pipelines - local: internal/tokenization_utils @@ -1035,5 +1076,5 @@ title: General Utilities - local: internal/time_series_utils title: Utilities for Time Series - title: Internal Helpers + title: Internal helpers title: API diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md index e0a7a9c65623..c0ad46f8ac91 100644 --- a/docs/source/en/accelerate.md +++ b/docs/source/en/accelerate.md @@ -1,4 +1,4 @@ - -# Distributed training with 🤗 Accelerate +# Accelerate -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. +[Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify distributed training on any type of setup with PyTorch by uniting the most common frameworks ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) for it into a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling loading big models and distributed training. 
-## Setup - -Get started by installing 🤗 Accelerate: +This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index). ```bash pip install accelerate ``` -Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. +Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup. -```py ->>> from accelerate import Accelerator +```bash +accelerate config +``` ->>> accelerator = Accelerator() +Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs may look like the following. + +```yaml +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_forward_prefetch: false + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false ``` -## Prepare to accelerate +## Trainer -The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer: +Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`]. ```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + fsdp_config="path/to/fsdp_config", + fsdp_strategy="full_shard", + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + processing_class=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() ``` -## Backward +## Native PyTorch -The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method: +Accelerate can also be added to any PyTorch training loop to enable distributed training. 
The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to. ```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) +from accelerate import Accelerator + +accelerator = Accelerator() +device = accelerator.device ``` -As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! +All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader. -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler +```py +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + train_dataloader, eval_dataloader, model, optimizer +) +``` -+ accelerator = Accelerator() +Replace `loss.backward` in your training loop with Accelerates [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron). - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) +```py +for epoch in range(num_epochs): + for batch in train_dataloader: + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) +Combine everything into a function and make it callable as a script. 
-+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) +```py +from accelerate import Accelerator + +def main(): + accelerator = Accelerator() - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps + model, optimizer, training_dataloader, scheduler = accelerator.prepare( + model, optimizer, training_dataloader, scheduler ) - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) + for batch in training_dataloader: + optimizer.zero_grad() + inputs, targets = batch + outputs = model(inputs) + loss = loss_function(outputs, targets) + accelerator.backward(loss) + optimizer.step() + scheduler.step() + +if __name__ == "__main__": + main() ``` -## Train - -Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. +From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well. -### Train with a script - -If you are running your training from a script, run the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Then launch your training with: +To launch your training script on two GPUs, add the `--num_processes` argument. ```bash -accelerate launch train.py -``` - -### Train with a notebook - -🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) +accelerate launch --num_processes=2 your_script.py ``` -For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). +Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) for more details. diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 9aab36bb6fbe..419b1dced412 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -1,4 +1,4 @@ - -# How to add a model to 🤗 Transformers? +# Adding a new model to Transformers -The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). +> [!TIP] +> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers! 
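As a rough illustration of that modular approach (hypothetical file and class names, assuming the new architecture can reuse the existing Llama implementation), a `modular_*.py` file can be little more than a handful of subclasses, which the modular conversion utility then expands into a complete `modeling_*.py` file:

```py
# Hypothetical modular_brand_new_llama.py sketch: reuse the existing Llama
# implementation and only override what actually differs. The modular
# conversion utility generates the full modeling file from these subclasses.
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaModel


class BrandNewLlamaConfig(LlamaConfig):
    model_type = "brand_new_llama"


class BrandNewLlamaModel(LlamaModel):
    pass


class BrandNewLlamaForCausalLM(LlamaForCausalLM):
    pass
```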
-Along the way, you'll: +Many of the models in Transformers are contributed by developers and researchers. As an open-source first project, we're invested in empowering the community to actively and independently add more models. -- get insights into open-source best practices -- understand the design principles behind one of the most popular deep learning libraries -- learn how to efficiently test large models -- learn how to integrate Python utilities like `black`, `ruff`, and `make fix-copies` to ensure clean and readable code +When you add a model to Transformers, you'll learn: -A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️ +- more about open-source best practices +- about a models architecture +- about Transformers' design principles +- how to efficiently test large models +- how to use Python utilities like [Black](https://black.readthedocs.io/en/stable/) and [Ruff](https://docs.astral.sh/ruff/) to create clean and readable code -To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it. +It is a challenging but rewarding process. -Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! +This guide will walk you through adding an example BrandNewLlama PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library. -## General overview of 🤗 Transformers +## Transformers overview -First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a -chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we -found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 -Transformers while keeping maintenance costs at a reasonable level. +Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us sustainably scale and maintain Transformers. -A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models: +> [!TIP] +> Learn more about our design principles on the [Philosophy](./philosophy) doc. -- Composition is generally favored over-abstraction -- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model -- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only - have to look into the respective `modeling_....py` file. +Some of these design choices are: -In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for -inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the -person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code. 
+- composition > over-abstraction +- duplicate code isn't always bad if it greatly improves readability and accessibility +- model files are self-contained and all the necessary model code is found in the `modeling_mymodel.py` file -With this in mind, let's go a bit deeper into the general library design. +These design choices are important *for everyone* interacting with the model. It is easier to read, understand, and modify. -### Overview of models +This section describes how the model and configuration classes interact and the Transformers code style. -To successfully add a model, it is important to understand the interaction between your model and its config, -[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will -call the model to be added to 🤗 Transformers `BrandNewBert`. +### Model and configuration -Let's take a look: +All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the models blueprint. - +There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewLlama, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. -As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute -minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel` -inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and -that's it. As a general rule, we want to make sure that a new model only depends on -[`PreTrainedModel`]. The important functionalities that are automatically provided to every new -model are [`~PreTrainedModel.from_pretrained`] and -[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the -other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new -`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as -`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel` -as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a -configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in -[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes -inheriting from `BrandNewBertPreTrainedModel`: +Other important functions like the forward method are defined in the `modeling.py` file. -```python -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # model has access to its config +Specific model heads (for example, sequence classification or language modeling) should call the base model in the forward pass rather than inheriting from it to keep abstraction low. + +New models require a configuration, for example `BrandNewLlamaConfig`, that is stored as an attribute of [`PreTrainedModel`]. + +```py +model = BrandNewLlamaModel.from_pretrained("username/brand_new_llama") +model.config ``` -Similar to the model, the configuration inherits basic serialization and deserialization functionalities from -[`PretrainedConfig`]. 
Note that the configuration and the model are always serialized into two -different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling -the model's [`~PreTrainedModel.save_pretrained`] will automatically call -the config's [`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. +[`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods. + +When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together. +A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file. ### Code style -When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our -own regarding how code should be written :-) - -1. The forward pass of your model should be fully written in the modeling file while being fully independent of other - models in the library. If you want to reuse a block from another model, copy the code and paste it with a - `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - for a good example and [there](pr_checks#check-copies) for more documentation on Copied from). -2. The code should be fully understandable, even by a non-native English speaker. This means you should pick - descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`. - One-letter variable names are strongly discouraged unless it's an index in a for loop. -3. More generally we prefer longer explicit code to short magical one. -4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone - using your code can quickly debug it by adding print statements or breaking points. -5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and - understandable than type annotations. - -### Overview of tokenizers - -Not quite ready yet :-( This section will be added soon! - -## Step-by-step recipe to add a model to 🤗 Transformers - -Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries -of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: - -1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) -2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) - -From experience, we can tell you that the most important things to keep in mind when adding a model are: - -- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist - somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy - from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your - friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and - your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code - is based on XLM. 
-- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an - efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper. -- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so we at Hugging Face are more - than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making - progress. - -In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. - -The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do -List: - -☐ (Optional) Understood the model's theoretical aspects
-☐ Prepared 🤗 Transformers dev environment
-☐ Set up debugging environment of the original repository
-☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
-☐ Successfully added the model skeleton to 🤗 Transformers
-☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
-☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
-☐ Finished model tests in 🤗 Transformers
-☐ Successfully added tokenizer in 🤗 Transformers
-☐ Run end-to-end integration tests
-☐ Finished docs
-☐ Uploaded model weights to the Hub
-☐ Submitted the pull request
-☐ (Optional) Added a demo notebook - -To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However, -if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive -into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than -your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming -much more than reading scientific papers. - -### 1. (Optional) Theoretical aspects of BrandNewBert - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the -theoretical aspects, but rather focus on the practical ones, namely: - -- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like - encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. -- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* - summarization? -- What is the novel feature of the model that makes it different from BERT/GPT-2/BART? -- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most - similar to *brand_new_bert*? -- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used - for BERT or BART? - -After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the -Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, -its attention layer, etc. We will be more than happy to help you. - -### 2. Next prepare your environment - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - - ```bash - git clone https://github.com/[your Github handle]/transformers.git - cd transformers - git remote add upstream https://github.com/huggingface/transformers.git - ``` - -3. Set up a development environment, for instance by running the following command: - - ```bash - python -m venv .env - source .env/bin/activate - pip install -e ".[dev]" - ``` - - Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a - failure with this command. If that's the case make sure to install the Deep Learning framework you are working with - (PyTorch, TensorFlow and/or Flax) then do: - - ```bash - pip install -e ".[quality]" - ``` - - which should be enough for most use cases. You can then return to the parent directory - - ```bash - cd .. - ``` - -4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the - instructions on https://pytorch.org/get-started/locally/. 
- - **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -5. To port *brand_new_bert*, you will also need access to its original repository: - - ```bash - git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git - cd brand_new_bert - pip install -e . - ``` - -Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. - -### 3.-4. Run a pretrained checkpoint using the original repository - -At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very -“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should -be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people -stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make -it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement -models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. - -You should start thereby by diving into the original repository. - -Successfully running the official pretrained model in the original repository is often **the most difficult** step. -From our experience, it is very important to spend some time getting familiar with the original code-base. You need to -figure out the following: - -- Where to find the pretrained weights? -- How to load the pretrained weights into the corresponding model? -- How to run the tokenizer independently from the model? -- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, - you only have to reimplement those functions. -- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, - *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, - *e.g.* *self-attention*, *cross-attention*...? -- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you - work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? +Transformers prefers a clean and readable code over a more abstracted code style. Some of the code style choices include: -It is very important that before you start the porting process, you can **efficiently** debug code in the original -repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or -even a pull request in the original repository. The maintainers of this repository are most likely very happy about -someone looking into their code! - -At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original -model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to -dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only -at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the -model also works as expected on GPU. +- The code should be accessible to non-English users. Pick descriptive variable names and avoid abbreviations. 
For example, "activation" is preferred over "act". One-letter variable names are highly discouraged unless it's an index in a for loop.

-In general, there are two possible debugging environments for running the original model
+- Explicit code is preferred - even if it's longer - over shorter code.

-- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
-- Local python scripts.
+- Avoid subclassing [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html). Subclass [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) instead so the code can be quickly debugged with print statements or breakpoints.

-Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
-logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
-notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
-Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them.
+- Function signatures should be type-annotated. Otherwise, use good variable names so they're more understandable.

-The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
-some time adjusting to the new programming environment and you might not be able to use your known debugging tools
-anymore, like `ipdb`.
+## New model addition issue

-For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
-single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
-pseudocode):
+Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model.

-```python
-model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
-input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
-original_output = model.predict(input_ids)
-```
-
-Next, regarding the debugging strategy, there are generally a few from which to choose from:
+> [!TIP]
+> Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub to view and add any existing model requests.

-- Decompose the original model into many small testable components and run a forward pass on each of those for
-  verification
-- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
-  those, and use intermediate print statements or breakpoints for verification
+Now is a good time to get familiar with BrandNewLlama. It is helpful to read the model's research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading.

-Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
-base.
+- What type of model is BrandNewLlama? Is it an encoder, decoder, or encoder-decoder model?
+- What tasks can BrandNewLlama be used for?
+- What makes BrandNewLlama different from other models?
+- What models in Transformers are most similar to BrandNewLlama?
+- What tokenizer does BrandNewLlama use?

-If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original
-code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages
-to taking the more difficult road in the beginning:
+In addition to learning more about your model, use the tips below to help you add a model faster.

-- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically
-  for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead
-  of relying on visual comparison via print statements
-- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting
-  individual components and thus structure your work better
-- separating the model into logical meaningful components will help you to get a better overview of the model's design
-  and thus to better understand the model
-- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
-  changing your code
+> [!TIP]
+> Each contributor has a unique style and workflow for adding models to Transformers. For an example, take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added.

-[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA
-gives a nice example of how this can be done.
+- Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this.
+- This is more of an engineering than a science challenge. Focus on the practical aspects (for example, setting up an efficient debugging environment) instead of the theoretical aspects of the model.
+- Don't be shy to ask for help! We are here to support you. 🤗

-However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
-it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
-example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is
-very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
-often relies on verifying print statements.
+## Dev environment

-No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the
-starting layers first and the ending layers last.
+Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy to work on. Clone the repository to your local disk and add the base repository as the remote.

-It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
-layers in the following order:
+```bash
+git clone https://github.com/[your Github handle]/transformers.git
+cd transformers
+git remote add upstream https://github.com/huggingface/transformers.git
+```

-1. Retrieve the input IDs passed to the model
-2. Retrieve the word embeddings
-3. 
Retrieve the input of the first Transformer layer
-4. Retrieve the output of the first Transformer layer
-5. Retrieve the output of the following n - 1 Transformer layers
-6. Retrieve the output of the whole BrandNewBert Model
+Create a virtual environment and perform an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies.

-Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`
+```bash
+python -m venv .env
+source .env/bin/activate
+pip install -e ".[dev]"
+```

-The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:
+The number of optional dependencies grows as Transformers grows, so this command may fail. If it does, install the "quality" dependencies instead. Also make sure you have a deep learning framework installed.
+```bash
+pip install -e ".[quality]"
```
-[[
- [-0.1465, -0.6501,  0.1993,  ...,  0.1451,  0.3430,  0.6024],
- [-0.4417, -0.5920,  0.3450,  ..., -0.3062,  0.6182,  0.7132],
- [-0.5009, -0.7122,  0.4548,  ..., -0.3662,  0.6091,  0.7648],
- ...,
- [-0.5613, -0.6332,  0.4324,  ..., -0.3792,  0.7372,  0.9288],
- [-0.5416, -0.6345,  0.4180,  ..., -0.3564,  0.6992,  0.9191],
- [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
+
+Return to the parent directory and clone and install the original BrandNewLlama repository.
+
+```bash
+git clone https://github.com/org_that_created_brand_new_llama_org/brand_new_llama.git
+cd brand_new_llama
+pip install -e .
```

-We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
-model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
-Since it is normal that the exact same model written in different libraries can give a slightly different output
-depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives
-nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate
-outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of
-*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
-important. Here is some advice to make your debugging environment as efficient as possible.
-
-- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
-  probably take the time to write a longer script that decomposes the original model into smaller sub-components to
-  retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
-  TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output
-  intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
-  running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196).
-- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
-  becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
-  In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
-  environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
-  of your model
-- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
-  find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
-  `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward`
-  multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`.
-- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where
-  you have to input a string, then try to find out where in the forward call the string input is changed to input ids
-  and start from this point. This might mean that you have to possibly write a small script yourself or change the
-  original code so that you can directly input the ids instead of an input string.
-- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
-  random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
-  environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed*
-  if the old and new implementations are in the same framework.
-
-The following section gives you more specific details/tips on how you can do this for *brand_new_bert*.
-
-### 5.-14. Port BrandNewBert to 🤗 Transformers
-
-Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork:
+Return to your clone of Transformers to begin porting BrandNewLlama.

```bash
cd transformers
```

-In the special case that you are adding a model whose architecture exactly matches the model architecture of an
-existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script).
-In this case, you can just re-use the whole model architecture of the already existing model.
+There are two possible debugging environments for running the original model: a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script.

-Otherwise, let's start generating a new model. We recommend using the following script to add a model starting from
-an existing model:
+> [!WARNING]
+> We don't recommend setting up a GPU environment to run the original model because it can be expensive. Instead, work in a CPU environment first to verify the model works in Transformers. Once it does, you can verify it on a GPU.
+
+Notebooks are great for executing code cell-by-cell, which can help split logical components from one another. They can also accelerate debugging cycles because intermediate results can be stored. You can also share notebooks when working with other contributors.
+
+The downside is that if you aren't used to working in notebooks, it may take some time to adjust to them.
+
+> [!TIP]
+> If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model.
+
+Run the command below to start and complete the questionnaire with some basic information about the new model.
This command jumpstarts the process by automatically generating some model code that you'll need to adapt.

```bash
transformers-cli add-new-model-like
```

-You will be prompted with a questionnaire to fill in the basic information of your model.
+## Create a pull request

-**Open a Pull Request on the main huggingface/transformers repo**
+Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewLlama** so it's clear that this is a work in progress.

-Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull
-request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work
-side-by-side on integrating the model into 🤗 Transformers.
+Create a branch with a descriptive name from your main branch.

-You should do the following:
+```bash
+git checkout -b add_brand_new_llama
+```

-1. Create a branch with a descriptive name from your main branch
+Commit the code, and then fetch and rebase on the main branch.

-   ```bash
-   git checkout -b add_brand_new_bert
-   ```
+```bash
+git add .
+git commit
+git fetch upstream
+git rebase upstream/main
+```

-2. Commit the automatically generated code:
+Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to indicate it's a work in progress.

-   ```bash
-   git add .
-   git commit
-   ```
+```bash
+git push -u origin add_brand_new_llama
+```

-3. Fetch and rebase to current main
+Include relevant Hugging Face team members by adding their GitHub handles in the pull request for questions, feedback, comments, and reviews. Direct team members to specific parts of the code you want them to look at by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean.

-   ```bash
-   git fetch upstream
-   git rebase upstream/main
-   ```
+Remember to periodically commit and push your work, and update your work with the current main branch.

-4. Push the changes to your account using:
+```bash
+git fetch upstream
+git merge upstream/main
+```

-   ```bash
-   git push -u origin a-descriptive-name-for-my-changes
-   ```
+## Original checkpoint

-5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
-   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
-   future changes.
+Take some time to work on the original model implementation first to understand how it works.

-6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone!

-In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
-that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
-time to time by doing:
+Orient yourself with the original repository by doing the following.
-
-```bash
-git fetch upstream
-git merge upstream/main
+- Locate the pretrained weights.
+- Figure out how to load the pretrained weights into the model.
+- Figure out how to run the tokenizer independently of the model.
+- Trace one forward pass to understand which classes and functions are required. These are probably the only classes and functions you'll have to implement.
+- Locate all the important components (model class, model subclasses, self-attention layer, etc.) of the model.
+- Figure out how to debug the model in the original repository. Add print statements, use interactive debuggers like [ipdb](https://github.com/gotcha/ipdb), or an efficient integrated development environment (IDE) like [PyCharm](https://www.jetbrains.com/pycharm/).
+
+The last point is especially important because you'll need a thorough understanding of what's happening inside the original model before you can reimplement it in Transformers. Feel free to open issues and pull requests in the original repository if you encounter any issues.
+
+A good first step is to load a *small* pretrained checkpoint and try to reproduce a single forward pass with an example integer vector of inputs. For example, in pseudocode, this could look like the following.
+
+```py
+model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
+original_output = model.generate(input_ids)
```

-In general, all questions you might have regarding the model or your implementation should be asked in your PR and
-discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
-if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
-Face team can efficiently understand your problem or question.
+### Debugging

-To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
-want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
-you can click on the “Resolve” button of the created comment.
+If you run into issues, you'll need to choose one of the following debugging strategies depending on the original model's codebase.

-In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
-on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
-Hugging Face team by Slack or email.
+
+

-**5. Adapt the generated models code for brand_new_bert**
+This strategy relies on breaking the original model into smaller sub-components, such as when the code can be easily run in eager mode. While more difficult, there are some advantages to this approach.

-At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
-found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
-`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.
+1. It is easier later to compare the original model to your implementation. You can automatically verify that each individual component matches its corresponding component in the Transformers implementation. This is better than relying on a visual comparison based on print statements.
+2. It is easier to port individual components instead of the entire model.
+3. 
It is easier to understand how a model works by breaking it up into smaller parts.
+4. It is easier to prevent regressions at a later stage when you change your code thanks to component-by-component tests.

-Now you can finally start coding :). The generated code in
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
-it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
-you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
-BART?*". Implement those changes which often means changing the *self-attention* layer, the order of the normalization
-layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
-get a better feeling of how your model should be implemented.
+> [!TIP]
+> Refer to the ELECTRA [integration checks](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) for a good example of how to decompose a model into smaller components.

-**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
-advised to add a first *unclean*, copy-pasted version of the original code to
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
-added. From our experience, it is much more efficient to quickly add a first version of the required code and
-improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
-has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
-following command should work:
+
+

-```python
-from transformers import BrandNewBertModel, BrandNewBertConfig
+This strategy is viable when the original codebase is too complex, only allows intermediate components to be run in compiled mode, or if it's too time-consuming (maybe even impossible) to separate the model into smaller sub-components.
+
+For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to rely on verifying print statements.

-model = BrandNewBertModel(BrandNewBertConfig())
+
+
+
+Whichever strategy you choose, it is recommended to debug the initial layers first and the final layers last. Retrieve the output, either with print statements or sub-component functions, of the following layers in this order.
+
+1. input ids passed to the model
+2. word embeddings
+3. input of the first Transformer layer
+4. output of the first Transformer layer
+5. output of the following n-1 Transformer layers
+6. output of the whole model
+
+The input ids should just be an array of integers like `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`.
+
+Layer outputs often consist of multi-dimensional float arrays.
+ +```py +[[ + [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], + ..., + [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], ``` -The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with -random weights, thus making sure that the `init()` methods of all components works. +Every Transformers model output should have a precision or error tolerance of *1e-3*. This accounts for any output differences that arise from using a different library framework. Compare the intermediate outputs of the original model with the Transformers implementation to ensure they're nearly identical. Having an *efficient* debugging environment is crucial for this step. + +Here are some tips for an efficient debugging environment. + +- To debug intermediate results, it depends on the machine learning framework the original model repository is using. For PyTorch, you should write a script to decompose the original model into smaller sub-components to retrieve the intermediate values. For TensorFlow, you may need to use [tf.print](https://www.tensorflow.org/api_docs/python/tf/print). For Flax, make sure the model is *not jitted* during the forward pass (refer to this GitHub [Issue](https://github.com/google/jax/issues/196) for more details). + +- It is faster to debug with a smaller pretrained checkpoint versus a larger checkpoint where the forward pass takes more than 10 seconds. If only large checkpoints are available, create a dummy model with randomly initialized weights and save those weights to compare against the Transformers implementation. + +- Find the easiest way to call the model's forward pass. Ideally, this function (may be called `predict`, `evaluate`, `forward`, or `__call__`) should only call the forward pass *once*. It is more difficult to debug a function that calls the forward pass multiple times. + +- Separate tokenization from the forward pass. Locate where a string input is changed to input ids in the forward pass and start here. You may need to create a small script or modify the original code to directly input the input ids instead of an input string. + +- Ensure the model is *not* in training mode. This can produce random outputs due to multiple dropout layers in a model. The forward pass in your debugging environment should be *deterministic* so that the dropout layers aren't used. + +Once you're able to run the original checkpoint, you're ready to start adapting the model code for Transformers. + +## Adapt the model code + +The `transformers-cli add-new-model-like` command should have generated a model and configuration file. + +- `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` +- `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py` + +The automatically generated code in the `modeling.py` file has the same architecture as Llama if you answered it's a decoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. 
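+
+The snippet below is only an illustrative sketch of the kind of local edits this step usually involves. The class, its attributes, and the sizes are hypothetical and simplified, not the actual generated code, but they show where changes to the attention block, the feedforward block, and the normalization order typically happen.
+
+```py
+import torch
+from torch import nn
+
+class BrandNewLlamaDecoderLayer(nn.Module):
+    """Hypothetical, simplified decoder layer used only to illustrate typical adaptations."""
+
+    def __init__(self, hidden_size: int = 64, intermediate_size: int = 256, num_heads: int = 4):
+        super().__init__()
+        # adapt the attention module if BrandNewLlama differs from Llama (heads, biases, positional embeddings, ...)
+        self.self_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
+        # adapt the feedforward block if the activation or projection layout differs
+        self.up_proj = nn.Linear(hidden_size, intermediate_size)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size)
+        self.activation = nn.GELU()
+        # swap the normalization type or move it if the paper uses post-norm instead of pre-norm
+        self.input_layernorm = nn.LayerNorm(hidden_size)
+        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # pre-norm ordering is shown here; reorder these calls if BrandNewLlama normalizes after the residual
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        attn_output, _ = self.self_attn(hidden_states, hidden_states, hidden_states)
+        hidden_states = residual + attn_output
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + self.down_proj(self.activation(self.up_proj(hidden_states)))
+        return hidden_states
+```
+
+In practice, the generated file already contains most of this structure, so you usually only tweak a few of these lines rather than writing a layer from scratch.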
+
+### Model initialization
+
+At this point, your code doesn't have to be clean or even fully correct. It is more efficient to quickly create a first draft and then iteratively improve on it. The most important thing is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works.
+
+```py
+from transformers import BrandNewLlamaModel, BrandNewLlamaConfig
+model = BrandNewLlamaModel(BrandNewLlamaConfig())
+```

-Note that all random initialization should happen in the `_init_weights` method of your `BrandnewBertPreTrainedModel`
-class. It should initialize all leaf modules depending on the variables of the config. Here is an example with the
-BERT `_init_weights` method:
+Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables.

```py
def _init_weights(self, module):
@@ -520,9 +326,9 @@ def _init_weights(self, module):
        module.weight.data.fill_(1.0)
```

-You can have some more custom schemes if you need a special initialization for some modules. For instance, in
-`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
-but all the other ones should use an initialization as above. This is coded like this:
+The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
+
+The `_is_hf_initialized` flag makes sure a submodule is only initialized once. Setting it to `True` for `module.project_q` and `module.project_hid` ensures the custom initialization is not overridden later, because `_init_weights` won't be applied to these modules.

```py
def _init_weights(self, module):
@@ -538,30 +344,34 @@ def _init_weights(self, module):
        module.bias.data.zero_()
```

-The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
-`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden later on,
-the `_init_weights` function won't be applied to them.
+### Convert checkpoints to Transformers

-**6. Write a conversion script**
+The original checkpoint must be converted to a Transformers-compatible checkpoint.

-Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
-the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
-*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
-existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
-the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
-slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
-existing conversion script for your model.
+> [!TIP]
+> Try looking for an existing conversion script to copy, adapt, and reuse for your model!
+>
+> - If you're porting a model from TensorFlow to PyTorch, a good starting point may be the BERT [conversion script](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91).
+> - If you're porting a model from PyTorch to PyTorch, a good starting point may be the BART [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py).

-- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
-- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+Make sure **all** required weights are initialized and print out all the checkpoint weights that weren't used for initialization to make sure the model has been converted correctly.

-In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
-name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
-PyTorch, called `SimpleModel` as follows:
+You may encounter errors about wrong shapes or wrong name assignments during the conversion. This is most likely because of incorrect parameters in `BrandNewLlamaConfig`, the wrong architecture, a bug in the `__init__` method of your implementation, or because you need to transpose one of the checkpoint weights.

-```python
-from torch import nn
+Keep iterating on the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file.
+```py
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+To help with conversion, the next section briefly describes how PyTorch models store and define layer weights and names.
+
+#### PyTorch layer weights and names
+
+It is helpful to create a basic PyTorch model to understand how layer names are defined and weights are initialized.
+
+```py
+from torch import nn

class SimpleModel(nn.Module):
    def __init__(self):
@@ -571,18 +381,11 @@ class SimpleModel(nn.Module):
        self.layer_norm = nn.LayerNorm(10)
```

-Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
-`layer_norm` with random weights. We can print the model to see its architecture
+PyTorch layer names are defined by the class attribute name of the layer (`dense`, `intermediate`, `layer_norm`). Create an instance of `SimpleModel` to fill all the layers with random weights.

-```python
+```py
model = SimpleModel()
-
print(model)
-```
-
-This will print out the following:
-
-```
SimpleModel(
  (dense): Linear(in_features=10, out_features=10, bias=True)
  (intermediate): Linear(in_features=10, out_features=10, bias=True)
@@ -590,16 +393,10 @@ SimpleModel(
)
```

-We can see that the layer names are defined by the name of the class attribute in PyTorch.
You can print out the weight
-values of a specific layer:
+The weight values of a specific layer are randomly initialized.

-```python
+```py
print(model.dense.weight.data)
-```
-
-to see that the weights were randomly initialized
-
-```
tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
         -0.2077,  0.2157],
        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
@@ -622,339 +419,247 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
          0.2220,  0.2358]]).
```

-In the conversion script, you should fill those randomly initialized weights with the exact weights of the
-corresponding layer in the checkpoint. *E.g.*
+In the conversion script, the random weights should be replaced with the exact weights from the corresponding layer in the original checkpoint.

-```python
-# retrieve matching layer weights, e.g. by
-# recursive algorithm
+```py
+# retrieve matching layer weights with recursive algorithm
layer_name = "dense"
pretrained_weight = array_of_dense_layer
model_pointer = getattr(model, "dense")
-
model_pointer.weight.data = torch.from_numpy(pretrained_weight)
```

-While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
-pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
-statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
+Verify the randomly initialized weights and their corresponding pretrained checkpoint weights have the identical **shape** and **name**. Add assert statements for the shape and print out the checkpoint weight names.

-```python
+```py
assert (
    model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```

-Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+When the shape or name doesn't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewLlama` parameters don't exactly match the original model's parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first.

-```python
-logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+### Implement the forward pass
+
+Once the model loads correctly, implement the forward pass next. It takes some inputs and returns the model output.
+
+```py
+model = BrandNewLlamaModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_state
```

-If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
-initialized layer of the 🤗 Transformers implementation.
+Don't be discouraged if your forward pass isn't identical to the output of the original model or if it returns an error. First, make sure the forward pass doesn't throw any errors. Errors are often caused by wrong dimensions or by using the wrong data type ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)).
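+
+As a sanity check before comparing values, the sketch below shows one way to run a deterministic forward pass with correctly shaped and typed inputs, which avoids the most common dimension and dtype errors. The checkpoint path is a placeholder.
+
+```py
+import torch
+from transformers import BrandNewLlamaModel
+
+model = BrandNewLlamaModel.from_pretrained("/path/to/converted/checkpoint/folder")
+model.eval()  # disable dropout so the output is deterministic
+
+# input ids must be a batched LongTensor of shape (batch_size, sequence_length)
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]], dtype=torch.long)
+
+with torch.no_grad():
+    output = model(input_ids).last_hidden_state
+
+print(output.shape, output.dtype)  # expect (1, 9, hidden_size) and torch.float32
+```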
-An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that -do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that -PyTorch's implementation of a layer requires the weight to be transposed beforehand. +Your output should have a precision of *1e-3*. Ensure the output shapes and output values are identical. Common reasons for why the outputs aren't identical include: -Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that -were not used for initialization to make sure the model is correctly converted. It is completely normal, that the -conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because either -you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers -implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers -implementation or you need to transpose one of the checkpoint weights. +- Some layers were not added (activation layer or a residual connection). +- The word embedding matrix is not tied. +- The wrong positional embeddings are used because the original implementation includes an offset. +- Dropout is applied during the forward pass. Fix this error by making sure `model.training` is `False` and passing `self.training` to [torch.nn.functional.dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout). -This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the -Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save -the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a -`pytorch_model.bin` file and a `config.json` file: +Compare the forward pass of the original model and your implementation to check if there are any differences. Ideally, debug and print out the intermediate outputs of both implementations of the forward pass to pinpoint where the original implementation differs from yours. -```python -model.save_pretrained("/path/to/converted/checkpoint/folder") -``` +1. Make sure the hardcoded `input_ids` in both implementations are identical. +2. Verify the outputs of the first transformation of `input_ids` (usually the word embeddings) are identical, and work your way through to the last layer. -**7. Implement the forward pass** +Any difference between the two implementations should point to the bug in your implementation. -Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make -sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#3-4-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward -pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers -implementation instead of the original one. It should look as follows: +One of the best strategies is to add many print statements to the same positions in both implementations, and then successively remove them when they output identical values for the intermediate outputs. 
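+
+One way to structure that comparison is a small helper like the sketch below. The function name and the captured tensors are placeholders; the only assumption is that you can dump the same intermediate tensors from both implementations.
+
+```py
+import torch
+
+def compare_intermediates(name, original, ours, atol=1e-3):
+    """Print how far apart two intermediate tensors from the two implementations are."""
+    original = torch.as_tensor(original)
+    ours = torch.as_tensor(ours)
+    if original.shape != ours.shape:
+        print(f"{name}: shape mismatch {tuple(original.shape)} vs {tuple(ours.shape)}")
+        return
+    max_diff = (original - ours).abs().max().item()
+    status = "ok" if max_diff < atol else "MISMATCH"
+    print(f"{name}: max abs diff {max_diff:.2e} ({status})")
+
+# toy usage with dummy tensors; in practice pass the tensors captured from each forward pass
+original_layer_0 = torch.randn(1, 9, 64)
+transformers_layer_0 = original_layer_0 + 1e-5 * torch.randn(1, 9, 64)
+compare_intermediates("layer_0_output", original_layer_0, transformers_layer_0)
+```
+
+Working through the layers in order with a check like this usually narrows the first mismatch down to a single module.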
-```python -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") -input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states -``` - -It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact -same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First, -you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are -used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long` -instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve -certain errors. - -The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are -equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.* -`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original -implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult -parts of adding a new model. Common mistakes why the outputs are not identical are: - -- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten -- The word embedding matrix was not tied -- The wrong positional embeddings are used because the original implementation uses on offset -- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout - layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 -Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out -intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 -Transformers implementation shows a different output than the original implementation. First, make sure that the -hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of -the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the -network. At some point, you will notice a difference between the two implementations, which should point you to the bug -in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements -in both the original implementation and 🤗 Transformers implementation, at the same positions in the network -respectively, and to successively remove print statements showing the same values for intermediate presentations. - -When you're confident that both implementations yield the same output, verify the outputs with -`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the -work left to be done should be a cakewalk 😊. - -**8. Adding all necessary model tests** - -At this point, you have successfully added a new model. However, it is very much possible that the model does not yet -fully comply with the required design. 
To make sure, the implementation is fully compatible with 🤗 Transformers, all -common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under -the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common -tests pass: +When both implementations produce the same output, verify the outputs are within a precision of *1e-3*. -```bash -pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py +```py +torch.allclose(original_output, output, atol=1e-3) ``` -Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that +This is typically the most difficult part of the process. Congratulations if you've made it this far! + +And if you're stuck or struggling with this step, don't hesitate to ask for help on your pull request. -- a) The community can easily understand your work by looking at specific tests of *brand_new_bert* -- b) Future changes to your model will not break any important feature of the model. +### Add model tests -At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts -you used earlier to implement the model to 🤗 Transformers. A template of those model tests has already added by the -Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those -tests are passing, run +While the model works, you still need to add tests to ensure it is compatible with Transformers. Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made. + +[Cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) should have added a test file for your model. Run the test file below to make sure all common tests pass. ```bash -RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +pytest tests/models/brand_new_llama/test_modeling_brand_new_llama.py ``` - +The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewLlamaModelIntegrationTests`, was added by Cookiecutter and should be filled out. To ensure it passes, run the following command. + + + + +```bash +RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests +``` -In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1` + + - +```bash +SET RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests +``` -Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under -`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two -ways: + + -- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the - special features of *brand_new_bert* should work. -- Future contributors can quickly test changes to the model by running those special tests. +All features unique to BrandNewLlama should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest`. 
This test is often overlooked, but it is extremely important because: +- it helps transfer knowledge you acquired during the process to the community by showing how the model's novel features work +- future contributors can quickly test changes to the model by running these special tests -**9. Implement the tokenizer** +## Implement tokenizer -Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an -already existing tokenizer of 🤗 Transformers. +> [!TIP] +> We recommend adding a fast tokenizer ([`PreTrainedTokenizerFast`]) to give users the best performance. Feel free to tag [@ArthurZucker](https://github.com/ArthurZucker) or [@itazap](https://github.com/itazap) in your PR for help on how to add [`PreTrainedTokenizerFast`]. -It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 -Transformers' implementation of the tokenizer. +With the model out of the way, it's time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers. -To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository -that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code): +Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to the code below. -```python +```py input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/") input_ids = model.tokenize(input_str) ``` -You might have to take a deeper look again into the original repository to find the correct tokenizer function or you -might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written -a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be -created. It should look similar to this: +You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look similar to the following. -```python -from transformers import BrandNewBertTokenizer +```py +from transformers import BrandNewLlamaTokenizer input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." - -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") - +tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/") input_ids = tokenizer(input_str).input_ids ``` -When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added. +When both implementations have the same `input_ids`, add a tokenizer test file. This file is analogous to the modeling test files. The tokenizer test files should contain a couple of hardcoded integration tests.
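+For example, a hardcoded tokenizer integration test could be a minimal sketch like the one below, assuming the hypothetical `BrandNewLlamaTokenizer` and that the expected ids are filled in from the original tokenizer.
+
+```py
+import unittest
+
+from transformers import BrandNewLlamaTokenizer
+
+
+class BrandNewLlamaTokenizationIntegrationTest(unittest.TestCase):
+    def test_tokenizer_matches_original(self):
+        tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/")
+        input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+        # fill in with the ids returned by the original tokenizer for the same string
+        expected_input_ids = [...]
+        self.assertEqual(tokenizer(input_str).input_ids, expected_input_ids)
+```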
+ +## Implement image processor + +> [!TIP] +> Fast image processors use the [torchvision](https://pytorch.org/vision/stable/index.html) library and can perform image processing on the GPU, significantly improving processing speed. +> We recommend adding a fast image processor ([`BaseImageProcessorFast`]) in addition to the "slow" image processor ([`BaseImageProcessor`]) to provide users with the best performance. Feel free to tag [@yonigozlan](https://github.com/yonigozlan) for help adding a [`BaseImageProcessorFast`]. -Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should -contain a couple of hard-coded integration tests. +While this example doesn't include an image processor, you may need to implement one if your model requires image inputs. The image processor is responsible for converting images into a format suitable for your model. Before implementing a new one, check whether an existing image processor in the Transformers library can be reused, as many models share similar image processing techniques. Note that you can also use [modular](./modular_transformers) for image processors to reuse existing components. -**10. Run End-to-end integration tests** +If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing. -Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the -tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers. -Such a test should show on a meaningful -text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can -include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none -of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a -final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can -happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a -test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those -tests for you. +Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model. -**11. Add Docstring** +```bash +transformers-cli add-fast-image-processor --model-name your_model_name +``` -Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is -a nice docstring and a doc page. The Cookiecutter should have added a template file called -`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at -this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for -the community to add some *Tips* to show how the model should be used. 
Don't hesitate to ping the Hugging Face team -regarding the docstrings. +This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs. -Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is -correct and included all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always good to remind oneself that documentation should -be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact -point of the community with the model. +Add tests for the image processor in `tests/models/your_model_name/test_image_processing_your_model_name.py`. These tests should be similar to those for other image processors and should verify that the image processor correctly handles image inputs. If your image processor includes unique features or processing methods, ensure you add specific tests for those as well. -**Code refactor** +## Implement processor -Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential -incorrect code style by running: +If your model accepts multiple modalities, like text and images, you need to add a processor. The processor centralizes the preprocessing of different modalities before passing them to the model. -```bash -make style +The processor should call the appropriate modality-specific processors within its `__call__` function to handle each type of input correctly. Be sure to check existing processors in the library to understand their expected structure. Transformers uses the following convention in the `__call__` function signature. + +```python +def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[YourModelProcessorKwargs], +) -> BatchFeature: + ... ``` -and verify that your coding style passes the quality check: +`YourModelProcessorKwargs` is a `TypedDict` that includes all the typical processing arguments and any extra arguments a specific processor may require. -```bash -make quality -``` +Add tests for the processor in `tests/models/your_model_name/test_processor_your_model_name.py`. These tests should be similar to those for other processors and should verify that the processor correctly handles the different modalities. -There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in -the tests of your pull request. This is often because of some missing information in the docstring or some incorrect -naming. The Hugging Face team will surely help you if you're stuck here. +## Integration tests -Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all -tests passing, now it's a good time to go over the added code again and do some refactoring. +Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_llama/test_modeling_brand_new_llama.py`. -You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎 +The test should provide a meaningful text-to-text example to show the model works as expected. 
For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair. -**12. Upload the models to the model hub** +If the checkpoint hasn't been fine-tuned on a downstream task, then the model tests are sufficient. -In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each -uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading Page](model_sharing). You should work alongside the Hugging Face team here to decide on a fitting name for each -checkpoint and to get the required access rights to be able to upload the model under the author's organization of -*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below: +Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the model's internal tensors. If you don't have access to a GPU, we can take care of that for you. -```python -brand_new_bert.push_to_hub("brand_new_bert") -# Uncomment the following line to push to an organization. -# brand_new_bert.push_to_hub("/brand_new_bert") -``` +## Add documentation + +Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_llama.md`, that you can fill out with information about your model. -It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the -specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint -pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to -correctly use the model. +This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used. -**13. (Optional) Add notebook** +Make sure docstrings are added to `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` and include all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings. -It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or -fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community. +## Refactor -**14. Submit your finished PR** +Time to tidy things up and make sure the code style is consistent with the rest of the library. Run the following command to automatically fix incorrect styles. -You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the -Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished -PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your -reviewer. +```bash +make style ``` -### Share your work!! +To verify the code style passes quality checks, run the command below. -Now, it's time to get some credit from the community for your work!
Having completed a model addition is a major -contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be -used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share -your achievements with the community. +```bash +make quality +``` -**You have made another model that is super easy to access for everyone in the community! 🤯** +There may be other failing tests or checks (missing docstring or incorrect naming) on your pull request due to Transformers' strict design tests. We can help you with these issues if you're stuck. -## Model additions and their timeline: when is a model added to transformers? +After ensuring the code runs correctly, you may want to refactor it to make it more readable or cleaner. -We aim for `transformers` to have support for new model architectures and checkpoints as early as possible: -availability can range from day-0 (and hour-0) releases for some models, to a few days/weeks for others. +## Upload to the Hub -The availability of this is usually up to the model contributors, as well as how excited the community is for the -architecture. +Convert and upload all checkpoints to the [Hub](https://hf.co/models). Add a model card to provide more transparency and context about the model. The model card should highlight specific characteristics of a checkpoint, how the model was trained, and code examples of how to use it. -We can split the model architecture possibilities in four sections: -- Day-0 integration -- Same-week integration -- Post-release integration -- Hub-first release +> [!TIP] +> In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or fine-tune it on a downstream task. While not required, including a notebook can drive greater adoption of your model. -Let's dive into each of these and see how we (the transformers team) can help you contribute your architecture and get -your architecture to be very easily used by all members of the community. +You should also consult with the Transformers team to decide on an appropriate name for the model, and to get the required access rights to upload the model. -### Day-0 integration +Use the [`~PreTrainedModel.push_to_hub`] method to upload the model. -For a day-0 integration to work, we'll usually want to work hand-in-hand with you directly. In order to keep your -architecture private until your checkpoints and release are ready, we'll work together in a private fork of -transformers. +```py +brand_new_llama.push_to_hub("brand_new_llama") +``` -If you plan on having a transformers-first release, this is a great option: we run CI ahead of time, ensure the -documentation is clear, and we aim to optimize your model as much as possible (providing quantization, optimizing it -with Flash-Attention/SDPA, optimizing the KV cache, etc). +Refer to the [Sharing](./model_sharing) guide for more information about uploading models to the Hub. -We can also lend you a hand in adding the model, reviewing it early, and help you make sure the `transformers` -API works as expected! +## Merge your model -If this is the path you wish to go with, we ask for you to reach out in advance, especially if the architecture is -particularly novel (at least a few days, but a few weeks will enable the absolute best integration). In order to reach -out, please contact transformers@huggingface.co 🤗.
+You're finally ready to merge your pull request and officially add the model to Transformers! Make sure all the tests are passing and all comments and feedback have been addressed. -### Same-week integration +Congratulations on adding a new model to Transformers! 🥳 -A same-week integration usually happens when model authors do not reach out; but we see significant community -requests. +This is a very significant contribution. Your work makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community! -In order to specify you'd like for us to integrate a specific model, we'll redirect you to our -[issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) -where you can request a specific model. +## Model addition timeline -The more activity on the issue, the faster/more likely we are to integrate the model! +There are four timelines for model additions depending on the model contributor and community demand for an architecture. -### Post-release integration +- **day-0 integration**: If you plan on having a Transformers-first release, this is a great option because we can ensure the documentation is clear and optimize your model as much as possible (quantization, FlashAttention, KV-cache, etc.). We can also help you add the model, provide early reviews and make sure it works as expected. -A post-release integration usually happens when there has not been sufficient activity/requests to warrant a same-week -integration, or that we lack the sufficient bandwidth to integrate it. + Reach out to transformers@huggingface.co a few days (preferably weeks) in advance, especially if an architecture is particularly novel, to ensure model integration. We'll work together on a private fork of Transformers until your checkpoint and release is ready. -We very gladly welcome community contributions in those instances; more than half of the library was contributed -by contributors external to Hugging Face. If this is something that is interesting to you, we recommend that you look -at our [open issues tagged with "New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). +- **same week integration**: Models with significant requests/demand are usually added the same week if the model author doesn't reach out. -We recommend you try your hand at a heavily requested model as this will multiply the impact of your contribution. -We'll be there to help you in case that's your first contribution 🤗. + Use the [issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) to request a specific model to add. The more activity on the issue, the faster and more likely we'll integrate it. -### Code-on-Hub release +- **post-release integration**: Models without popular requests/demand or if we don't have the bandwidth to integrate it are added post-release. -Finally, transformers has a "remote-code" possibility, in which contributions are not made within the toolkit, but on -the Hub. This can be particularly interesting for groups that are using `transformers` as a backbone for their project, -but don't have the bandwidth to contribute the model to transformers directly. + This is a good opportunity if you're interested in contributing a model to Transformers. 
Take a look at open issues tagged with ["New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). Feel free to give the most requested models a try first to multiply the impact of your contribution. We'll be there to help you each step of the way! -In case the model is very successful, then we'll very likely end up integrating it in `transformers` at the end - as this -provides better documentation, CI, maintenance, and optimizations - but this remains a great way to make your model -accessible day-0 with minimal friction. +- **Hub-first release**: Transformers [remote-code](./models#custom-models) feature allows Transformers-based projects to be shared directly on the Hub. This is a good option if you don't have the bandwidth to add a model directly to Transformers. -This guide is a great starting point for a Hub-first release: [Custom models](./custom_models) \ No newline at end of file + If a model ends up being very popular, then it's very likely that we'll integrate it in Transformers ourselves to enable better support (documentation, maintenance, optimization, etc.) for it. A Hub-first release is the most frictionless way to add a model. diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index e8234c565b26..60ef43dab585 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -1,4 +1,4 @@ - -# How to create a custom pipeline? +# Adding a new pipeline -In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the -🤗 Transformers library. +Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it. -First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, -dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible -as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the -pipeline (`preprocess`). +This guide will walk you through the process of adding a new pipeline to Transformers. -Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of -`postprocess` method. +## Design choices -Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, -`_forward`, `postprocess`, and `_sanitize_parameters`. +At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline. +Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with. -```python -from transformers import Pipeline +Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. 
For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data. + +## Create a pipeline + +With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods. +```py +from transformers import Pipeline class MyPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} + def preprocess(self, inputs, args=2): def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class ``` -The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing -pre/postprocessing on the CPU on different threads - -`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might -contain more information and is usually a `Dict`. - -`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred -called method as it contains safeguards to make sure everything is working on the expected device. If anything is -linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. - -`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided -earlier. - -`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization -time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. +1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model. -The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, -`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That -allows to keep the default arguments in the function definition which is always more "natural". - -A classic example would be a `top_k` argument in the post processing in classification tasks. +```py +def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} +``` -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] +2. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`. 
->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +```py +def _forward(self, model_inputs): + outputs = self.model(**model_inputs) + return outputs ``` -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. - +3. `postprocess` generates the final output from the models output in `_forward`. -```python +```py def postprocess(self, model_outputs, top_k=5): best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k return best_class +``` + +4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural. +For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`. +```py def _sanitize_parameters(self, **kwargs): preprocess_kwargs = {} if "maybe_arg" in kwargs: @@ -110,55 +84,61 @@ def _sanitize_parameters(self, **kwargs): return preprocess_kwargs, {}, postprocess_kwargs ``` -Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy -without requiring users to understand new kinds of objects. It's also relatively common to support many different types -of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) +Now the pipeline can return the top most likely labels if a user chooses to. +```py +from transformers import pipeline +pipeline = pipeline("my-task") +# returns 3 most likely labels +pipeline("This is the best meal I've ever had", top_k=3) +# returns 5 most likely labels by default +pipeline("This is the best meal I've ever had") +``` + +## Register a pipeline -## Adding it to the list of supported tasks +Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. 
The registry defines: -To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: +- the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework) +- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default` +- the expected input with `type` -```python +```py from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification PIPELINE_REGISTRY.register_pipeline( "new-task", pipeline_class=MyPipeline, pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, + default={"pt": ("user/awesome-model", "branch-name")}, + type="text", ) ``` -You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: +## Share your pipeline -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` +Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers. -## Share your pipeline on the Hub +It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works. -To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a -python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: +### Upload to the Hub + +Add your pipeline code to the Hub in a Python file. + +For example, a custom pipeline for sentence pair classification might look like the code below. The implementation works for PyTorch and TensorFlow models. ```py import numpy as np - from transformers import Pipeline - def softmax(outputs): maxes = np.max(outputs, axis=-1, keepdims=True) shifted_exp = np.exp(outputs - maxes) return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - class PairClassificationPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): preprocess_kwargs = {} @@ -183,8 +163,7 @@ class PairClassificationPipeline(Pipeline): return {"label": label, "score": score, "logits": logits} ``` -The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this. +Save the code in a file named `pair_classification.py`, and import and register it as shown below. ```py from pair_classification import PairClassificationPipeline @@ -215,56 +194,36 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f }, ``` -Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been -fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. +Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub.
The Python file containing the code is copied to the Hub, and the pipeline's model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace. ```py from transformers import pipeline -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") +pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc") +pipeline.push_to_hub("pair-classification-pipeline") ``` -Then we can share it on the Hub by using the `push_to_hub` method: - -```py -classifier.push_to_hub("test-dynamic-pipeline") -``` - -This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, -along with saving the model and tokenizer of the pipeline, before pushing everything into the repository -`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option -`trust_remote_code=True`: +To use the pipeline, point `model` to your pushed repository and add `trust_remote_code=True` when loading the pipeline. ```py from transformers import pipeline -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +pipeline = pipeline(model="{your_username}/pair-classification-pipeline", trust_remote_code=True) ``` -## Add the pipeline to 🤗 Transformers +### Add to Transformers + +Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team. -If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule -with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`. +Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py). -Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests. +Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline. -The `run_pipeline_test` function will be very generic and run on small random models on every possible -architecture as defined by `model_mapping` and `tf_model_mapping`. +The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models. -This is very important to test future compatibility, meaning if someone adds a new model for -`XXXForQuestionAnswering` then the pipeline test will attempt to run on it.
Because the models are random it's -impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the -output of the pipeline TYPE. +You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to simply match the type of the pipeline output instead. -You also *need* to implement 2 (ideally 4) tests. +Finally, you should also implement the following 4 tests. -- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_tf`. -- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. -- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. +1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. The PyTorch and TensorFlow tests should return the same result. +1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow. diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 56c9184980f4..bd24d8ce30cc 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -13,211 +13,135 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# Agents and tools -[[open-in-colab]] - -### What is an agent? - -Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to. - -One approach to overcome this weakness is to create an *agent*.
- -An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*. - -These *tools* are functions for performing a task, and they contain all necessary description for the agent to properly use them. +> [!WARNING] +> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future! -The agent can be programmed to: -- devise a series of actions/tools and run them all at once, like the [`CodeAgent`] -- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`ReactJsonAgent`] +# Agents -### Types of agents +[[open-in-colab]] -#### Code agent +An agent is a system where a large language model (LLM) can execute more complex tasks through *planning* and using *tools*. -This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks. +- Planning helps an LLM reason its way through a task by breaking it down into smaller subtasks. For example, [`CodeAgent`] plans a series of actions to take and then generates Python code to execute all the actions at once. -#### React agents + Another planning method is by self-reflection and refinement of its previous actions to improve its performance. The [`ReactJsonAgent`] is an example of this type of planning, and it's based on the [ReAct](https://hf.co/papers/2210.03629) framework. This agent plans and executes actions one at a time based on the feedback it receives from each action. -This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations. +- Tools give an LLM access to external functions or APIs that it can use to help it complete a task. For example, [gradio-tools](https://github.com/freddyaboulton/gradio-tools) gives an LLM access to any of the [Gradio](https://www.gradio.app/) apps available on Hugging Face [Spaces](https://hf.co/spaces). These apps can be used for a wide range of tasks such as image generation, video generation, audio transcription, and more. -We implement two versions of ReactJsonAgent: -- [`ReactJsonAgent`] generates tool calls as a JSON in its output. -- [`ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance. +To use agents in Transformers, make sure you have the extra `agents` dependencies installed. -> [!TIP] -> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents. +```bash +!pip install transformers[agents] +```
- - -
+Create an agent instance (refer to the [Agents](./main_classes/agent#agents) API for supported agents in Transformers) and a list of tools available for it to use, then [`~ReactAgent.run`] the agent on your task. The example below demonstrates how a ReAct agent reasons through a task. -![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) +```py +from transformers import ReactCodeAgent -For example, here is how a ReAct Code agent would work its way through the following question. +agent = ReactCodeAgent(tools=[]) +agent.run( + "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", +) +``` -```py3 ->>> agent.run( -... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", -... ) -=====New task===== +```bash +======== New task ======== How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? -====Agent is executing the code below: -bert_blocks = search(query="number of blocks in BERT base encoder") -print("BERT blocks:", bert_blocks) +==== Agent is executing the code below: +bert_layers = 12 # BERT base encoder has 12 layers +attention_layers = 6 # Encoder in Attention is All You Need has 6 layers +layer_diff = bert_layers - attention_layers +print("The difference in layers between BERT base encoder and Attention is All You Need is", layer_diff) ==== Print outputs: -BERT blocks: twelve encoder blocks +The difference in layers between BERT base encoder and Attention is All You Need is 6 -====Agent is executing the code below: -attention_layer = search(query="number of layers in Attention is All You Need") -print("Attention layers:", attention_layer) -==== -Print outputs: -Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. - -====Agent is executing the code below: -bert_blocks = 12 -attention_layers = 6 -diff = bert_blocks - attention_layers -print("Difference in blocks:", diff) -final_answer(diff) +==== Agent is executing the code below: +final_answer("BERT base encoder has {} more layers than the encoder from Attention is All You Need.".format(layer_diff)) ==== - Print outputs: -Difference in blocks: 6 - -Final answer: 6 -``` - -### How can I build an agent? - -To initialize an agent, you need these arguments: - -- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine. -- a system prompt: what the LLM engine will be prompted with to generate its output -- a toolbox from which the agent pick tools to execute -- a parser to extract from the LLM output which tools are to call and with which arguments - -Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why. -To start with, please install the `agents` extras in order to install all default dependencies. - -```bash -pip install transformers[agents] +>>> Final answer: +BERT base encoder has 6 more layers than the encoder from Attention is All You Need. 
``` -Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating. +This guide will walk you through how to initialize an agent in more detail. -```python -from huggingface_hub import login, InferenceClient +## LLM -login("") +An agent uses an LLM to plan and execute a task; it is the engine that powers the agent. To choose and build your own LLM engine, you need a method that: -client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") +1. accepts input in the [chat template](./chat_templating) format, `List[Dict[str, str]]`, and returns a string +2. stops generating outputs when it encounters the sequences in `stop_sequences` +```py +from huggingface_hub import InferenceClient + +client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") + def llm_engine(messages, stop_sequences=["Task"]) -> str: response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) answer = response.choices[0].message.content return answer ``` -You could use any `llm_engine` method as long as: -1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`. -2. it stops generating outputs at the sequences passed in the argument `stop_sequences` +Next, initialize an engine to load a model. To run an agent locally, create a [`TransformersEngine`] to load a preinitialized [`Pipeline`]. -Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs. +However, you could also leverage Hugging Face's powerful inference infrastructure, [Inference API](https://hf.co/docs/api-inference/index) or [Inference Endpoints](https://hf.co/docs/inference-endpoints/index), to run your model. This is useful for loading larger models that are typically required for agentic behavior. In this case, load the [`HfApiEngine`] to run the agent. -You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`. +The agent requires a list of tools it can use to complete a task. If you aren't using any additional tools, pass an empty list. The default tools provided by Transformers are not loaded automatically; set `add_base_tools=True` to add them on top of your `tools` list. -Now you can create an agent, like [`CodeAgent`], and run it. You can also create a [`TransformersEngine`] with a pre-initialized pipeline to run inference on your local machine using `transformers`. -For convenience, since agentic behaviours generally require stronger models such as `Llama-3.1-70B-Instruct` that are harder to run locally for now, we also provide the [`HfApiEngine`] class that initializes a `huggingface_hub.InferenceClient` under the hood.
+ + -```python -from transformers import CodeAgent, HfApiEngine - -llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") -agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) +```py +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine, CodeAgent +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to("cuda") +pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer) +llm_engine = TransformersEngine(pipeline) +agent = CodeAgent(tools=[], llm_engine=llm_engine) agent.run( - "Could you translate this sentence from French, say it out loud and return the audio.", - sentence="Où est la boulangerie la plus proche?", + "What causes bread to rise?", ) ``` -This will be handy in case of emergency baguette need! -You can even leave the argument `llm_engine` undefined, and an [`HfApiEngine`] will be created by default. + + -```python -from transformers import CodeAgent - -agent = CodeAgent(tools=[], add_base_tools=True) +```py +from transformers import CodeAgent, HfApiEngine +llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") +agent = CodeAgent(tools=[], llm_engine=llm_engine) agent.run( - "Could you translate this sentence from French, say it out loud and give me the audio.", + "Could you translate this sentence from French, say it out loud and return the audio.", sentence="Où est la boulangerie la plus proche?", ) ``` -Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model. + + -You can also use this to indicate the path to local or remote files for the model to use: +The agent supports [constrained generation](https://hf.co/docs/text-generation-inference/conceptual/guidance) for generating outputs according to a specific structure with the `grammar` parameter. The `grammar` parameter should be specified in the `llm_engine` method or you can set it when initializing an agent. + +Lastly, an agent accepts additional inputs such as text and audio. In the [`HfApiEngine`] example above, the agent accepted a sentence to translate. But you could also pass a path to a local or remote file for the agent to access. The example below demonstrates how to pass a path to an audio file. ```py from transformers import ReactCodeAgent -agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) - -agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") -``` - - -The prompt and output parser were automatically defined, but you can easily inspect them by calling the `system_prompt_template` on your agent. - -```python -print(agent.system_prompt_template) +agent = ReactCodeAgent(tools=[], llm_engine=llm_engine) +agent.run("Why doesn't he know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") ``` -It's important to explain as clearly as possible the task you want to perform. -Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results. -You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized. 
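+To inspect the prompt template an agent uses, including the tool descriptions baked into it at initialization, print its `system_prompt_template` attribute (a small sketch; `agent` is assumed to be one of the agents created in the examples above).
+
+```py
+# inspect the system prompt template the agent was initialized with
+print(agent.system_prompt_template)
+```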
+## System prompt
+A system prompt describes how an agent should behave, the tools that are available to it, and the expected output format.
-#### Code execution
+Tools are defined by the `<>` token, which is dynamically replaced at runtime with the actual tool descriptions. The tool description is derived from the tool name, description, inputs, output type, and a Jinja2 template. Refer to the [Tools](./tools) guide for more information about how to describe tools.
-A Python interpreter executes the code on a set of inputs passed along with your tools.
-This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
-
-The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue.
-You can still authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`ReactCodeAgent`] or [`CodeAgent`]:
+The example below is the system prompt for [`ReactCodeAgent`].
```py
->>> from transformers import ReactCodeAgent
-
->>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
->>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-
-(...)
-'Hugging Face – Blog'
-```
-
-The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
-
-> [!WARNING]
-> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports!
-
-### The system prompt
-
-An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`ReactCodeAgent`] (below version is slightly simplified).
-
-```text
You will be given a task to solve as best you can. You have access to the following tools:
<>
@@ -235,7 +159,7 @@ Here are a few examples using notional tools:
---
{examples}
-Above example were using notional tools that might not exist for you. You only have acces to those tools:
+Above example were using notional tools that might not exist for you. You only have access to those tools:
<>
You also can perform computations in the python code you generate.
@@ -249,183 +173,125 @@ Remember to make sure that variables you use are all defined.
Now Begin!
```
-The system prompt includes:
-- An *introduction* that explains how the agent should behave and what tools are.
-- A description of all the tools that is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
-  - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`, and a simple `jinja2` template that you can refine.
-- The expected output format.
-
-You could improve the system prompt, for example, by adding an explanation of the output format.
+The system prompt can be tailored to the intended task. For example, you can add a better explanation of the output format, or overwrite the template entirely by passing your own custom system prompt to the `system_prompt` parameter as shown below.
-For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
+> [!WARNING]
+> If you're writing a custom system prompt, make sure to include `<>` in the template so the agent is aware of the available tools.
-```python
+```py
from transformers import ReactJsonAgent
from transformers.agents import PythonInterpreterTool
agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
```
-> [!WARNING]
-> Please make sure to define the `<>` string somewhere in the `template` so the agent is aware
-of the available tools.
-
-
-### Inspecting an agent run
-
-Here are a few useful attributes to inspect what happened after a run:
-- `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`.
-Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method.
-
-## Tools
-
-A tool is an atomic function to be used by an agent.
-
-You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
-
-When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
+## Code execution
-### Default toolbox
+For safety, the only functions that can be called are the tools you provide (and the default Transformers tools) and the `print` function. The interpreter also doesn't allow importing modules that aren't on a safe list.
-Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`:
+To import modules that aren't on the list, pass them as a list of strings to the `additional_authorized_imports` parameter when initializing an agent.
-- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
-- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
-- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
-- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
-- **Translation**: translates a given sentence from source language to target language.
-- **DuckDuckGo search***: performs a web search using DuckDuckGo browser.
-- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code
-
-
-You can manually use a tool by calling the [`load_tool`] function and a task to perform.
-
-
-```python
-from transformers import load_tool
+```py
+from transformers import ReactCodeAgent
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
+agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
```
+Code execution stops if the code tries to perform an illegal operation, such as importing a module that isn't authorized, or if the code generated by the agent raises a regular Python error.
-### Create a new tool
-
-You can create your own tool for use cases not covered by the default tools from Hugging Face.
-For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
-
-You'll start with the code below.
-
-```python
-from huggingface_hub import list_models
+> [!WARNING]
+> An LLM can generate arbitrary code that will then be executed, so don't add any unsafe imports!
-task = "text-classification"
+## Multi-agent
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
+[Multi-agent](https://hf.co/papers/2308.08155) refers to multiple agents working together to solve a task. Performance is typically better because each agent is specialized for a particular subtask.
-This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator:
+Multi-agent systems are created with the [`ManagedAgent`] class, where a *manager agent* coordinates how other agents work together. The manager agent requires an agent along with its name and a description. These are added to the manager agent's system prompt, which lets it know how to call and use them.
+The multi-agent example below creates a web search agent that is managed by another [`ReactCodeAgent`].
```py
-from transformers import tool
-
-@tool
-def model_download_tool(task: str) -> str:
-    """
-    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
-    It returns the name of the checkpoint.
-
-    Args:
-        task: The task for which
-    """
-    model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1)))
-    return model.id
-```
-
-The function needs:
-- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_tool`.
-- Type hints on both inputs and output
-- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
-All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
-
-> [!TIP]
-> This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template).
-
-Then you can directly initialize your agent:
-```py
-from transformers import CodeAgent
-agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
-agent.run(
-    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent + +llm_engine = HfApiEngine() +web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine) +managed_web_agent = ManagedAgent( + agent=web_agent, + name="web_search", + description="Runs web searches for you. Give it your query as an argument." ) -``` - -You get the following: -```text -======== New task ======== -Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? -==== Agent is executing the code below: -most_downloaded_model = model_download_tool(task="text-to-video") -print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") -==== -``` - -And the output: -`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` - -### Manage your agent's toolbox - -If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool. - -Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox. - -```python -from transformers import CodeAgent - -agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) -agent.toolbox.add_tool(model_download_tool) -``` -Now we can leverage both the new tool and the previous text-to-speech tool: - -```python -agent.run( - "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" +manager_agent = ReactCodeAgent( + tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent] ) +manager_agent.run("Who is the CEO of Hugging Face?") ``` +## Gradio integration -| **Audio** | -|------------------------------------------------------------------------------------------------------------------------------------------------------| -|